mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2024-11-24 16:59:16 +08:00)

add and update samples with CUDA 11.3 support

This commit is contained in:
parent 067cb65523
commit 568b39bd5b

README.md (67)
@@ -1,11 +1,17 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

 This section describes the release notes for the CUDA Samples on GitHub only.

+### CUDA 11.3
+* Added `streamOrderedAllocationIPC`. Demonstrates Inter Process Communication using one process per GPU for computation.
+* Added `simpleCUBLAS_LU`. Demonstrates batched matrix LU decomposition using cuBLAS API `cublas<t>getrfBatched()`
+* Updated `simpleVulkan`. Demonstrates use of timeline semaphore.
+* Updated multiple samples to use pinned memory using `cudaMallocHost()`.
+
 ### CUDA 11.2
 * Added `streamOrderedAllocation`. Demonstrates stream ordered memory allocation on a GPU using cudaMallocAsync and cudaMemPool family of APIs.
 * Added `streamOrderedAllocationP2P`. Demonstrates peer-to-peer access of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs.
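As background for the stream-ordered allocation entries above, here is a minimal, illustrative sketch of the `cudaMallocAsync`/`cudaFreeAsync` pattern those samples are built around. It is not taken from the commit; the buffer size is a placeholder and error checking is omitted.

```cpp
// Minimal sketch of stream-ordered allocation (CUDA 11.2+ runtime API).
// Not from the cuda-samples sources; error checking omitted for brevity.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const size_t bytes = (1 << 20) * sizeof(float);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *d_buf = nullptr;
  // The allocation, the work on it, and the free are all ordered by `stream`,
  // so no device-wide synchronization is needed around them.
  cudaMallocAsync((void **)&d_buf, bytes, stream);
  cudaMemsetAsync(d_buf, 0, bytes, stream);
  cudaFreeAsync(d_buf, stream);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  printf("stream-ordered allocation sketch done\n");
  return 0;
}
```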
@@ -103,7 +109,7 @@ This is the first release of CUDA Samples on GitHub:

 ### Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).

 ### Getting the CUDA Samples
@@ -160,38 +166,39 @@ The samples makefiles can take advantage of certain options:
 ### Samples by OS

 #### Linux
-**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** |
+**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** |
 ---|---|---|---|
-**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** |
-**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** |
-**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[nvJPEG](./Samples/nvJPEG)** |
-**[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** |
-**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** |
-**[cudaNvSci](./Samples/cudaNvSci)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** |
-**[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** |
-**[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** |
-**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** |
-**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** |
-**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** |
-**[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** |
-**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** |
+**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
+**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaNvSci](./Samples/cudaNvSci)** |
+**[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** |
+**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** |
+**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[matrixMul](./Samples/matrixMul)** |
+**[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** |
+**[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[reduction](./Samples/reduction)** |
+**[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** |
+**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
+**[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** |
+**[simpleVulkan](./Samples/simpleVulkan)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** |
+**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** |
+**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** |
+**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** |

 #### Windows
-**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** |
+**[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** |
 ---|---|---|---|
-**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** |
-**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
-**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[simpleD3D12](./Samples/simpleD3D12)** |
-**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[deviceQuery](./Samples/deviceQuery)** |
-**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** |
-**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** |
-**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** |
-**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** |
-**[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** |
-**[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** |
-**[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** |
-**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** |
-**[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** |
+**[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
+**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaOpenMP](./Samples/cudaOpenMP)** |
+**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** |
+**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** |
+**[matrixMul](./Samples/matrixMul)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** |
+**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** |
+**[reduction](./Samples/reduction)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** |
+**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
+**[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** |
+**[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleVulkan](./Samples/simpleVulkan)** |
+**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** |
+**[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
+**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** |

 ## Dependencies
@@ -285,6 +285,12 @@ ifeq ($(TARGET_OS),android)
   SAMPLE_ENABLED := 0
 endif

+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
 ALL_LDFLAGS :=
 ALL_LDFLAGS += $(ALL_CCFLAGS)
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@@ -30,7 +30,7 @@ cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount,

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -263,6 +263,14 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
 ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
 ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

+SAMPLE_ENABLED := 1
+
+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - MersenneTwisterGP11213 is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
 ALL_LDFLAGS :=
 ALL_LDFLAGS += $(ALL_CCFLAGS)
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@@ -297,6 +305,10 @@ ALL_CCFLAGS += --threads 0

 LIBRARIES += -lcurand_static -lculibos

+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
 ################################################################################

 # Target rules
@@ -304,16 +316,23 @@ all: build

 build: MersenneTwisterGP11213

+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
 MersenneTwister.o:MersenneTwister.cpp
-	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

 MersenneTwisterGP11213: MersenneTwister.o
-	$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
-	mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
-	cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

 run: build
-	./MersenneTwisterGP11213
+	$(EXEC) ./MersenneTwisterGP11213

 clean:
 	rm -f MersenneTwisterGP11213 MersenneTwister.o
@@ -53,8 +53,7 @@ const unsigned int DEFAULT_SEED = 777;
 ///////////////////////////////////////////////////////////////////////////////
 // Main program
 ///////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
   // Start logs
   printf("%s Starting...\n\n", argv[0]);
@@ -65,8 +64,7 @@ int main(int argc, char **argv)
   // parsing the number of random numbers to generate
   int rand_n = DEFAULT_RAND_N;

-  if (checkCmdLineFlag(argc, (const char **) argv, "count"))
-  {
+  if (checkCmdLineFlag(argc, (const char **)argv, "count")) {
     rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count");
   }
@@ -75,8 +73,7 @@ int main(int argc, char **argv)
   // parsing the seed
   int seed = DEFAULT_SEED;

-  if (checkCmdLineFlag(argc, (const char **) argv, "seed"))
-  {
+  if (checkCmdLineFlag(argc, (const char **)argv, "seed")) {
     seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed");
   }
@@ -94,19 +91,21 @@ int main(int argc, char **argv)
   checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed));

   curandGenerator_t prngCPU;
-  checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
+  checkCudaErrors(
+      curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
   checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed));

   //
   // Example 1: Compare random numbers generated on GPU and CPU
-  float *h_RandGPU = (float *)malloc(rand_n * sizeof(float));
+  float *h_RandGPU;
+  checkCudaErrors(cudaMallocHost(&h_RandGPU, rand_n * sizeof(float)));

   printf("Generating random numbers on GPU...\n\n");
   checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n));

   printf("\nReading back the results...\n");
-  checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float),
+                                  cudaMemcpyDeviceToHost, stream));

   float *h_RandCPU = (float *)malloc(rand_n * sizeof(float));
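The `malloc()` to `cudaMallocHost()` change above is what the release note about pinned memory refers to: `cudaMemcpyAsync` can only be truly asynchronous when the host buffer is page-locked. A hedged, self-contained sketch of that pattern (buffer names and sizes are invented for illustration, not taken from the sample):

```cpp
// Illustrative sketch: pinned host memory enables asynchronous device-to-host
// copies on a stream. Not from the commit; error checking omitted.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float *d_buf = nullptr, *h_buf = nullptr;
  cudaMalloc((void **)&d_buf, n * sizeof(float));
  cudaMallocHost(&h_buf, n * sizeof(float));  // page-locked (pinned) host memory

  cudaMemsetAsync(d_buf, 0, n * sizeof(float), stream);
  // With pageable (malloc'd) memory this copy would be effectively synchronous.
  cudaMemcpyAsync(h_buf, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  printf("h_buf[0] = %f\n", h_buf[0]);
  cudaFreeHost(h_buf);  // cudaMallocHost pairs with cudaFreeHost, not free()
  cudaFree(d_buf);
  cudaStreamDestroy(stream);
  return 0;
}
```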
@@ -127,8 +126,7 @@ int main(int argc, char **argv)
   sdkResetTimer(&hTimer);
   sdkStartTimer(&hTimer);

-  for (i = 0; i < numIterations; i++)
-  {
+  for (i = 0; i < numIterations; i++) {
     checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n));
   }
@@ -137,7 +135,9 @@ int main(int argc, char **argv)

   double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations;

-  printf("MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n",
+  printf(
+      "MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, "
+      "Size = %u Numbers\n",
       1.0e-9 * rand_n / gpuTime, gpuTime, rand_n);

   printf("Shutting down...\n");
@@ -147,31 +147,27 @@ int main(int argc, char **argv)
   checkCudaErrors(cudaStreamDestroy(stream));
   checkCudaErrors(cudaFree(d_Rand));
   sdkDeleteTimer(&hTimer);
-  free(h_RandGPU);
+  checkCudaErrors(cudaFreeHost(h_RandGPU));
   free(h_RandCPU);

   exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU)
-{
+float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) {
   int i;
   float rCPU, rGPU, delta;
   float max_delta = 0.;
   float sum_delta = 0.;
   float sum_ref = 0.;

-  for (i = 0; i < rand_n; i++)
-  {
+  for (i = 0; i < rand_n; i++) {
     rCPU = h_RandCPU[i];
     rGPU = h_RandGPU[i];
     delta = fabs(rCPU - rGPU);
     sum_delta += delta;
     sum_ref += fabs(rCPU);

-    if (delta >= max_delta)
-    {
+    if (delta >= max_delta) {
       max_delta = delta;
     }
   }
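For reference, the accumulators in `compareResults` feed the pass/fail check shown earlier. As far as this excerpt shows (the final division happens outside the hunk, so treat this as a reading rather than a quote), the `L1norm` compared against `1e-6` is the relative L1 difference between the host and GPU sequences:

```latex
\text{L1norm} \;=\; \frac{\sum_{i=0}^{\texttt{rand\_n}-1} \bigl| r^{\mathrm{CPU}}_i - r^{\mathrm{GPU}}_i \bigr|}
                         {\sum_{i=0}^{\texttt{rand\_n}-1} \bigl| r^{\mathrm{CPU}}_i \bigr|} \;<\; 10^{-6}
```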
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>

@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -113,6 +113,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>

@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -109,6 +109,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -27,7 +27,7 @@ cudaMemcpy2D, cudaMallocManaged

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run
@@ -28,7 +28,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -111,6 +111,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>

@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -107,6 +107,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -27,7 +27,7 @@ cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>

@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -271,6 +271,12 @@ ifeq ($(TARGET_OS),darwin)
   SAMPLE_ENABLED := 0
 endif

+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
 ALL_LDFLAGS :=
 ALL_LDFLAGS += $(ALL_CCFLAGS)
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@@ -28,7 +28,7 @@ x86_64, ppc64le, armv7l

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -36,7 +36,9 @@
 #include <string.h>
 #include <fstream>

+#include <cuda_runtime.h>
 #include <helper_cuda.h>
+#include <helper_string.h>
 #include <npp.h>

 // Note: If you want to view these images we HIGHLY recommend using imagej
@@ -102,11 +104,12 @@ void tearDown() // Clean up and tear down
   if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev);
   if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev);
   if (pUFBatchPerImageCompressedCountListHost != 0)
-    free(pUFBatchPerImageCompressedCountListHost);
+    cudaFreeHost(pUFBatchPerImageCompressedCountListHost);
   if (pUFBatchSrcDstScratchBufferListHost != 0)
-    free(pUFBatchSrcDstScratchBufferListHost);
-  if (pUFBatchSrcDstImageListHost != 0) free(pUFBatchSrcDstImageListHost);
-  if (pUFBatchSrcImageListHost != 0) free(pUFBatchSrcImageListHost);
+    cudaFreeHost(pUFBatchSrcDstScratchBufferListHost);
+  if (pUFBatchSrcDstImageListHost != 0)
+    cudaFreeHost(pUFBatchSrcDstImageListHost);
+  if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost);

   for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
     if (pUFCompressedLabelsScratchBufferDev[j] != 0)
@@ -115,8 +118,8 @@ void tearDown() // Clean up and tear down
       cudaFree(pUFGenerateLabelsScratchBufferDev[j]);
     if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]);
     if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]);
-    if (pUFLabelHost[j] != 0) free(pUFLabelHost[j]);
-    if (pInputImageHost[j] != 0) free(pInputImageHost[j]);
+    if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]);
+    if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]);
   }
 }
@@ -177,7 +180,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
       exit(EXIT_WAIVED);
     }

-    bmpFile = fopen(InputFile, "rb");
+    FOPEN(bmpFile, InputFile, "rb");
   } else if (nImage == 1) {
     if (nWidth != 512 || nHeight != 512) return -1;
     const char *fileName = "CT_skull_512x512_8u.raw";
@@ -187,7 +190,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
       exit(EXIT_WAIVED);
     }

-    bmpFile = fopen(InputFile, "rb");
+    FOPEN(bmpFile, InputFile, "rb");
   } else if (nImage == 2) {
     if (nWidth != 509 || nHeight != 335) return -1;
     const char *fileName = "PCB_METAL_509x335_8u.raw";
@@ -197,7 +200,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
      exit(EXIT_WAIVED);
     }

-    bmpFile = fopen(InputFile, "rb");
+    FOPEN(bmpFile, InputFile, "rb");
   } else if (nImage == 3) {
     if (nWidth != 1024 || nHeight != 683) return -1;
     const char *fileName = "PCB2_1024x683_8u.raw";
@@ -207,7 +210,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
       exit(EXIT_WAIVED);
     }

-    bmpFile = fopen(InputFile, "rb");
+    FOPEN(bmpFile, InputFile, "rb");
   } else if (nImage == 4) {
     if (nWidth != 1280 || nHeight != 720) return -1;
     const char *fileName = "PCB_1280x720_8u.raw";
@@ -217,7 +220,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
       exit(EXIT_WAIVED);
     }

-    bmpFile = fopen(InputFile, "rb");
+    FOPEN(bmpFile, InputFile, "rb");
   } else {
     printf("Input file load failed.\n");
     return -1;
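`FOPEN` above comes from the samples' shared helper headers rather than from this file. A plausible definition (an assumption on my part; the authoritative one is whatever `helper_string.h` ships) simply papers over the `fopen_s` versus `fopen` split between MSVC and other toolchains:

```cpp
// Hypothetical sketch of a portable FOPEN wrapper; the real macro in the
// CUDA samples' helper headers may differ in detail.
#include <cstdio>

#ifdef _WIN32
#define FOPEN(fHandle, filename, mode) fopen_s(&(fHandle), (filename), (mode))
#else
#define FOPEN(fHandle, filename, mode) ((fHandle) = fopen((filename), (mode)))
#endif

int main() {
  FILE *bmpFile = nullptr;
  // "CT_skull_512x512_8u.raw" is one of the input files referenced by the sample.
  FOPEN(bmpFile, "CT_skull_512x512_8u.raw", "rb");
  if (bmpFile == nullptr) return -1;
  fclose(bmpFile);
  return 0;
}
```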
@@ -347,9 +350,11 @@ int main(int argc, char **argv) {
         oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height);
     if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

-    pInputImageHost[nImage] = reinterpret_cast<Npp8u *>(malloc(
+    checkCudaErrors(cudaMallocHost(
+        &(pInputImageHost[nImage]),
         oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height));
-    pUFLabelHost[nImage] = reinterpret_cast<Npp32u *>(malloc(
+    checkCudaErrors(cudaMallocHost(
+        &(pUFLabelHost[nImage]),
         oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height));

     // Use UF functions throughout this sample.
@@ -409,15 +414,15 @@
     }

     if (nImage == 0)
-      bmpFile = fopen(LabelMarkersOutputFile0.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb");
     else if (nImage == 1)
-      bmpFile = fopen(LabelMarkersOutputFile1.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb");
     else if (nImage == 2)
-      bmpFile = fopen(LabelMarkersOutputFile2.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb");
     else if (nImage == 3)
-      bmpFile = fopen(LabelMarkersOutputFile3.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb");
     else if (nImage == 4)
-      bmpFile = fopen(LabelMarkersOutputFile4.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb");

     if (bmpFile == NULL) return -1;
     size_t nSize = 0;
@@ -478,15 +483,15 @@
     }

     if (nImage == 0)
-      bmpFile = fopen(CompressedMarkerLabelsOutputFile0.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb");
     else if (nImage == 1)
-      bmpFile = fopen(CompressedMarkerLabelsOutputFile1.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb");
     else if (nImage == 2)
-      bmpFile = fopen(CompressedMarkerLabelsOutputFile2.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb");
     else if (nImage == 3)
-      bmpFile = fopen(CompressedMarkerLabelsOutputFile3.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb");
     else if (nImage == 4)
-      bmpFile = fopen(CompressedMarkerLabelsOutputFile4.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb");

     if (bmpFile == NULL) return -1;
     nSize = 0;
@@ -554,10 +559,11 @@
       cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes);
   if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

-  pUFBatchSrcImageListHost =
-      reinterpret_cast<NppiImageDescriptor *>(malloc(nBatchImageListBytes));
-  pUFBatchSrcDstImageListHost =
-      reinterpret_cast<NppiImageDescriptor *>(malloc(nBatchImageListBytes));
+  checkCudaErrors(
+      cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes));
+  checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost,
+                                 nBatchImageListBytes));

   NppiSize oMaxROISize = {0, 0};
@@ -620,15 +626,15 @@
   // Save output to files
   for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
     if (nImage == 0)
-      bmpFile = fopen(LabelMarkersBatchOutputFile0.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb");
     else if (nImage == 1)
-      bmpFile = fopen(LabelMarkersBatchOutputFile1.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb");
     else if (nImage == 2)
-      bmpFile = fopen(LabelMarkersBatchOutputFile2.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb");
     else if (nImage == 3)
-      bmpFile = fopen(LabelMarkersBatchOutputFile3.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb");
     else if (nImage == 4)
-      bmpFile = fopen(LabelMarkersBatchOutputFile4.c_str(), "wb");
+      FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb");

     if (bmpFile == NULL) return -1;
     size_t nSize = 0;
@@ -652,12 +658,13 @@

   // Allocate host side scratch buffer point and size list and initialize with
   // device scratch buffer pointers
-  pUFBatchSrcDstScratchBufferListHost =
-      reinterpret_cast<NppiBufferDescriptor *>(
-          malloc(NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor)));
+  checkCudaErrors(
+      cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost,
+                     NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor)));

-  pUFBatchPerImageCompressedCountListHost =
-      reinterpret_cast<Npp32u *>(malloc(NUMBER_OF_IMAGES * sizeof(Npp32u)));
+  checkCudaErrors(
+      cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost,
+                     NUMBER_OF_IMAGES * sizeof(Npp32u)));

   // Start buffer pointer at beginning of full per image buffer list sized
   // pUFCompressedLabelsScratchBufferDev[0]
@@ -728,15 +735,15 @@
   // Save compressed label images into files
   for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
     if (nImage == 0)
-      bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb");
     else if (nImage == 1)
-      bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb");
     else if (nImage == 2)
-      bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb");
     else if (nImage == 3)
-      bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb");
     else if (nImage == 4)
-      bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb");
+      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb");

     if (bmpFile == NULL) return -1;
     size_t nSize = 0;
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>

@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -108,6 +108,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -31,14 +31,16 @@
|
||||||
* 1.) Each thread loads a value from random array.
|
* 1.) Each thread loads a value from random array.
|
||||||
* 2.) then checks if it is odd or even.
|
* 2.) then checks if it is odd or even.
|
||||||
* 3.) create binary partition group based on the above predicate
|
* 3.) create binary partition group based on the above predicate
|
||||||
* 4.) we count the number of odd/even in the group based on size of the binary groups
|
* 4.) we count the number of odd/even in the group based on size of the binary
|
||||||
|
groups
|
||||||
* 5.) write it to the global counter of odds.
|
* 5.) write it to the global counter of odds.
|
||||||
* 6.) sum the values loaded by individual threads(using reduce) and write it to global
|
* 6.) sum the values loaded by individual threads(using reduce) and write it to
|
||||||
* even & odd elements sum.
|
global even & odd elements sum.
|
||||||
*
|
*
|
||||||
* **NOTE** : binary_partition results in splitting warp into divergent thread groups
|
* **NOTE** :
|
||||||
this is not good from performance perspective, but in cases where warp
|
* binary_partition results in splitting warp into divergent thread groups
|
||||||
divergence is inevitable one can use binary_partition group.
|
* this is not good from performance perspective, but in cases where warp
|
||||||
|
* divergence is inevitable one can use binary_partition group.
|
||||||
*/
|
*/
|
||||||
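The re-wrapped header comment above walks through the algorithm in prose; a minimal standalone sketch of that partition-and-reduce pattern (kernel name, parameter names, and the grid-stride loop here are illustrative, not taken from the commit) looks like:

```cuda
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace cg = cooperative_groups;

// Split each 32-thread tile by the odd/even predicate, then reduce inside
// each resulting group and have the group leader update the global counters.
__global__ void countOddEven(const int *in, int *oddCount, int *sums, int n) {
  cg::thread_block cta = cg::this_thread_block();
  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    int elem = in[i];
    auto subTile = cg::binary_partition(tile32, elem & 1);  // odd vs. even group
    int groupSum = cg::reduce(subTile, elem, cg::plus<int>());
    if (subTile.thread_rank() == 0) {                       // one thread per group
      if (elem & 1) {
        atomicAdd(oddCount, (int)subTile.size());           // count of odds
        atomicAdd(&sums[0], groupSum);                      // sum of odds
      } else {
        atomicAdd(&sums[1], groupSum);                      // sum of evens
      }
    }
  }
}
```

As the NOTE says, `binary_partition` deliberately creates divergent groups, so this pattern is only worth it when the divergence would happen anyway.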
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
@ -48,50 +50,42 @@
|
||||||
|
|
||||||
namespace cg = cooperative_groups;
|
namespace cg = cooperative_groups;
|
||||||
|
|
||||||
void initOddEvenArr(int *inputArr, unsigned int size)
|
void initOddEvenArr(int *inputArr, unsigned int size) {
|
||||||
{
|
for (int i = 0; i < size; i++) {
|
||||||
for (int i=0; i < size; i++)
|
|
||||||
{
|
|
||||||
inputArr[i] = rand() % 50;
|
inputArr[i] = rand() % 50;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CUDA kernel device code
|
* CUDA kernel device code
|
||||||
*
|
*
|
||||||
* Creates cooperative groups and performs odd/even counting & summation.
|
* Creates cooperative groups and performs odd/even counting & summation.
|
||||||
*/
|
*/
|
||||||
__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOddAndEvens, unsigned int size)
|
__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds,
|
||||||
{
|
int *sumOfOddAndEvens, unsigned int size) {
|
||||||
cg::thread_block cta = cg::this_thread_block();
|
cg::thread_block cta = cg::this_thread_block();
|
||||||
cg::grid_group grid = cg::this_grid();
|
cg::grid_group grid = cg::this_grid();
|
||||||
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
|
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
|
||||||
|
|
||||||
for (int i = grid.thread_rank(); i < size; i += grid.size())
|
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
|
||||||
{
|
|
||||||
int elem = inputArr[i];
|
int elem = inputArr[i];
|
||||||
auto subTile = cg::binary_partition(tile32, elem & 1);
|
auto subTile = cg::binary_partition(tile32, elem & 1);
|
||||||
if (elem & 1) // Odd numbers group
|
if (elem & 1) // Odd numbers group
|
||||||
{
|
{
|
||||||
int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
|
int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
|
||||||
|
|
||||||
if (subTile.thread_rank() == 0)
|
if (subTile.thread_rank() == 0) {
|
||||||
{
|
|
||||||
// Add number of odds present in this group of Odds.
|
// Add number of odds present in this group of Odds.
|
||||||
atomicAdd(numOfOdds, subTile.size());
|
atomicAdd(numOfOdds, subTile.size());
|
||||||
|
|
||||||
// Add local reduction of odds present in this group of Odds.
|
// Add local reduction of odds present in this group of Odds.
|
||||||
atomicAdd(&sumOfOddAndEvens[0], oddGroupSum);
|
atomicAdd(&sumOfOddAndEvens[0], oddGroupSum);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
} else // Even numbers group
|
||||||
else // Even numbers group
|
|
||||||
{
|
{
|
||||||
int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
|
int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
|
||||||
|
|
||||||
if (subTile.thread_rank() == 0)
|
if (subTile.thread_rank() == 0) {
|
||||||
{
|
|
||||||
// Add local reduction of even present in this group of evens.
|
// Add local reduction of even present in this group of evens.
|
||||||
atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
|
atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
|
||||||
}
|
}
|
||||||
|
@ -102,21 +96,19 @@ __global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOd
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Host main routine
|
* Host main routine
|
||||||
*/
|
*/
|
||||||
int main(int argc, const char **argv)
|
int main(int argc, const char **argv) {
|
||||||
{
|
|
||||||
int deviceId = findCudaDevice(argc, argv);
|
int deviceId = findCudaDevice(argc, argv);
|
||||||
int *h_inputArr, *d_inputArr;
|
int *h_inputArr, *d_inputArr;
|
||||||
int *h_numOfOdds, *d_numOfOdds;
|
int *h_numOfOdds, *d_numOfOdds;
|
||||||
int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems;
|
int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems;
|
||||||
unsigned int arrSize = 1024 * 100;
|
unsigned int arrSize = 1024 * 100;
|
||||||
|
|
||||||
h_inputArr = new int[arrSize];
|
checkCudaErrors(cudaMallocHost(&h_inputArr, sizeof(int) * arrSize));
|
||||||
h_numOfOdds = new int[1];
|
checkCudaErrors(cudaMallocHost(&h_numOfOdds, sizeof(int)));
|
||||||
h_sumOfOddEvenElems = new int[2];
|
checkCudaErrors(cudaMallocHost(&h_sumOfOddEvenElems, sizeof(int) * 2));
|
||||||
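The switch above from `new`/`delete` to `cudaMallocHost()`/`cudaFreeHost()` gives page-locked (pinned) host buffers, which is what lets the later `cudaMemcpyAsync` calls actually overlap with the stream instead of silently serializing through a staging copy. A minimal sketch of the pattern (`arrSize` and `stream` are assumed to be the sample's variables; the buffer names are illustrative):

```cpp
int *h_buf = nullptr, *d_buf = nullptr;
size_t bytes = sizeof(int) * arrSize;

checkCudaErrors(cudaMallocHost(&h_buf, bytes));       // pinned host allocation
checkCudaErrors(cudaMalloc(&d_buf, bytes));

// Pinned source => the async copy can truly run asynchronously in `stream`.
checkCudaErrors(cudaMemcpyAsync(d_buf, h_buf, bytes, cudaMemcpyHostToDevice, stream));
// ... kernel work on d_buf in the same stream ...
checkCudaErrors(cudaStreamSynchronize(stream));

checkCudaErrors(cudaFreeHost(h_buf));                 // pairs with cudaMallocHost
checkCudaErrors(cudaFree(d_buf));
```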
initOddEvenArr(h_inputArr, arrSize);
|
initOddEvenArr(h_inputArr, arrSize);
|
||||||
|
|
||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
|
@ -125,27 +117,39 @@ int main(int argc, const char **argv)
|
||||||
checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int)));
|
checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int)));
|
||||||
checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2));
|
checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2));
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int)*arrSize, cudaMemcpyHostToDevice, stream));
|
checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int) * arrSize,
|
||||||
|
cudaMemcpyHostToDevice, stream));
|
||||||
checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream));
|
checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream));
|
||||||
checkCudaErrors(cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2*sizeof(int), stream));
|
checkCudaErrors(
|
||||||
|
cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2 * sizeof(int), stream));
|
||||||
|
|
||||||
// Launch the kernel
|
// Launch the kernel
|
||||||
int threadsPerBlock=1024;
|
int threadsPerBlock = 0;
|
||||||
int blocksPerGrid = arrSize / threadsPerBlock;
|
int blocksPerGrid = 0;
|
||||||
|
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
|
||||||
|
&blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG, 0, 0));
|
||||||
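Instead of the old hard-coded 1024-thread blocks with `arrSize / threadsPerBlock` blocks, the new code asks the occupancy API for a launch configuration. In outline (this mirrors the new lines above; note the first output parameter is really the minimum grid size needed for full occupancy, which the sample simply reuses as its grid size):

```cpp
int blocksPerGrid = 0, threadsPerBlock = 0;
// Ask the runtime for a block size that maximizes occupancy for this kernel,
// and the smallest grid that fully loads the device at that block size.
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
    &blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG,
    /*dynamicSMemSize=*/0, /*blockSizeLimit=*/0));

oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
    d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);
```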
|
|
||||||
printf("\nLaunching %d blocks with %d threads...\n\n",blocksPerGrid, threadsPerBlock);
|
printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid,
|
||||||
|
threadsPerBlock);
|
||||||
|
|
||||||
oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);
|
oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
|
||||||
|
d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), cudaMemcpyDeviceToHost, stream));
|
checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int),
|
||||||
checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, 2*sizeof(int), cudaMemcpyDeviceToHost, stream));
|
cudaMemcpyDeviceToHost, stream));
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems,
|
||||||
|
2 * sizeof(int), cudaMemcpyDeviceToHost,
|
||||||
|
stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], h_sumOfOddEvenElems[1]);
|
printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n",
|
||||||
|
arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0],
|
||||||
|
h_sumOfOddEvenElems[1]);
|
||||||
printf("\n...Done.\n\n");
|
printf("\n...Done.\n\n");
|
||||||
|
|
||||||
delete[] h_inputArr;
|
checkCudaErrors(cudaFreeHost(h_inputArr));
|
||||||
delete[] h_numOfOdds;
|
checkCudaErrors(cudaFreeHost(h_numOfOdds));
|
||||||
delete[] h_sumOfOddEvenElems;
|
checkCudaErrors(cudaFreeHost(h_sumOfOddEvenElems));
|
||||||
|
|
||||||
checkCudaErrors(cudaFree(d_inputArr));
|
checkCudaErrors(cudaFree(d_inputArr));
|
||||||
checkCudaErrors(cudaFree(d_numOfOdds));
|
checkCudaErrors(cudaFree(d_numOfOdds));
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -108,6 +108,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -118,6 +118,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -114,6 +114,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -118,6 +118,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -114,6 +114,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -108,6 +108,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
SAMPLE_ENABLED := 1
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on QNX
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
$(info >>> WARNING - conjugateGradientCudaGraphs is not supported on QNX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
ALL_LDFLAGS :=
|
ALL_LDFLAGS :=
|
||||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
|
|
@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -25,7 +25,6 @@
|
||||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This sample implements a conjugate gradient solver on GPU
|
* This sample implements a conjugate gradient solver on GPU
|
||||||
* using CUBLAS and CUSPARSE with CUDA Graphs
|
* using CUBLAS and CUSPARSE with CUDA Graphs
|
||||||
|
@ -46,7 +45,6 @@
|
||||||
#include <helper_cuda.h> // helper function CUDA error checking and initialization
|
#include <helper_cuda.h> // helper function CUDA error checking and initialization
|
||||||
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
|
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
|
||||||
|
|
||||||
|
|
||||||
const char *sSDKname = "conjugateGradientCudaGraphs";
|
const char *sSDKname = "conjugateGradientCudaGraphs";
|
||||||
|
|
||||||
#ifndef WITH_GRAPH
|
#ifndef WITH_GRAPH
|
||||||
|
@ -145,12 +143,12 @@ int main(int argc, char **argv) {
|
||||||
/* Generate a random tridiagonal symmetric matrix in CSR format */
|
/* Generate a random tridiagonal symmetric matrix in CSR format */
|
||||||
N = 1048576;
|
N = 1048576;
|
||||||
nz = (N - 2) * 3 + 4;
|
nz = (N - 2) * 3 + 4;
|
||||||
I = (int *)malloc(sizeof(int) * (N + 1));
|
checkCudaErrors(cudaMallocHost(&I, sizeof(int) * (N + 1)));
|
||||||
J = (int *)malloc(sizeof(int) * nz);
|
checkCudaErrors(cudaMallocHost(&J, sizeof(int) * nz));
|
||||||
val = (float *)malloc(sizeof(float) * nz);
|
checkCudaErrors(cudaMallocHost(&val, sizeof(float) * nz));
|
||||||
genTridiag(I, J, val, N, nz);
|
genTridiag(I, J, val, N, nz);
|
||||||
|
|
||||||
x = (float *)malloc(sizeof(float) * N);
|
checkCudaErrors(cudaMallocHost(&x, sizeof(float) * N));
|
||||||
rhs = (float *)malloc(sizeof(float) * N);
|
rhs = (float *)malloc(sizeof(float) * N);
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
|
@ -192,9 +190,9 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
/* Wrap raw data into cuSPARSE generic API objects */
|
/* Wrap raw data into cuSPARSE generic API objects */
|
||||||
cusparseSpMatDescr_t matA = NULL;
|
cusparseSpMatDescr_t matA = NULL;
|
||||||
checkCudaErrors(cusparseCreateCsr(
|
checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val,
|
||||||
&matA, N, N, nz, d_row, d_col, d_val, CUSPARSE_INDEX_32I,
|
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
|
||||||
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
|
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
|
||||||
cusparseDnVecDescr_t vecx = NULL;
|
cusparseDnVecDescr_t vecx = NULL;
|
||||||
checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F));
|
checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F));
|
||||||
cusparseDnVecDescr_t vecp = NULL;
|
cusparseDnVecDescr_t vecp = NULL;
|
||||||
|
@ -206,7 +204,7 @@ int main(int argc, char **argv) {
|
||||||
size_t bufferSize = 0;
|
size_t bufferSize = 0;
|
||||||
checkCudaErrors(cusparseSpMV_bufferSize(
|
checkCudaErrors(cusparseSpMV_bufferSize(
|
||||||
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
|
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
|
||||||
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize));
|
&beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
|
||||||
void *buffer = NULL;
|
void *buffer = NULL;
|
||||||
checkCudaErrors(cudaMalloc(&buffer, bufferSize));
|
checkCudaErrors(cudaMalloc(&buffer, bufferSize));
|
||||||
|
|
||||||
|
@ -234,9 +232,9 @@ int main(int argc, char **argv) {
|
||||||
beta = 0.0;
|
beta = 0.0;
|
||||||
|
|
||||||
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
|
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
|
||||||
checkCudaErrors(cusparseSpMV(
|
checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||||
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
|
&alpha, matA, vecx, &beta, vecAx, CUDA_R_32F,
|
||||||
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
|
CUSPARSE_SPMV_ALG_DEFAULT, buffer));
|
||||||
|
|
||||||
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
|
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
|
||||||
checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1));
|
checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1));
|
||||||
|
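Besides the clang-format re-wrapping, the substantive change in these `cusparseSpMV` hunks is the cuSPARSE 11.3 enum rename (`CUSPARSE_MV_ALG_DEFAULT` → `CUSPARSE_SPMV_ALG_DEFAULT`) and passing the workspace pointer itself (`buffer`, not `&buffer`). Reduced to a sketch using the sample's own descriptors and device arrays, the generic-API sequence is:

```cpp
// Describe A (CSR) and the dense vectors, size the workspace once, then
// compute vecAx = alpha * A * x + beta * vecAx.
cusparseSpMatDescr_t matA = NULL;
cusparseDnVecDescr_t vecx = NULL, vecAx = NULL;
checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val,
                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F));
checkCudaErrors(cusparseCreateDnVec(&vecAx, N, d_Ax, CUDA_R_32F));

size_t bufferSize = 0;
checkCudaErrors(cusparseSpMV_bufferSize(
    cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
    &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));

void *buffer = NULL;
checkCudaErrors(cudaMalloc(&buffer, bufferSize));

checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F,
                             CUSPARSE_SPMV_ALG_DEFAULT, buffer));  // buffer, not &buffer
```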
@ -248,9 +246,9 @@ int main(int argc, char **argv) {
|
||||||
k = 1;
|
k = 1;
|
||||||
// First Iteration when k=1 starts
|
// First Iteration when k=1 starts
|
||||||
checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1));
|
checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1));
|
||||||
checkCudaErrors(cusparseSpMV(
|
checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||||
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
|
&alpha, matA, vecp, &beta, vecAx, CUDA_R_32F,
|
||||||
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
|
CUSPARSE_SPMV_ALG_DEFAULT, buffer));
|
||||||
|
|
||||||
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
|
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
|
||||||
|
|
||||||
|
@ -290,9 +288,9 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
checkCudaErrors(
|
checkCudaErrors(
|
||||||
cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
|
cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
|
||||||
checkCudaErrors(cusparseSpMV(
|
checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||||
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
|
&alpha, matA, vecp, &beta, vecAx, CUDA_R_32F,
|
||||||
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
|
CUSPARSE_SPMV_ALG_DEFAULT, buffer));
|
||||||
|
|
||||||
checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1));
|
checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1));
|
||||||
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
|
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
|
||||||
|
@ -336,7 +334,7 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
checkCudaErrors(cusparseSpMV(
|
checkCudaErrors(cusparseSpMV(
|
||||||
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
|
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
|
||||||
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer));
|
&beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));
|
||||||
|
|
||||||
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
||||||
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
|
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
|
||||||
|
@ -395,23 +393,31 @@ int main(int argc, char **argv) {
|
||||||
cusparseDestroy(cusparseHandle);
|
cusparseDestroy(cusparseHandle);
|
||||||
cublasDestroy(cublasHandle);
|
cublasDestroy(cublasHandle);
|
||||||
|
|
||||||
if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); }
|
if (matA) {
|
||||||
if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); }
|
checkCudaErrors(cusparseDestroySpMat(matA));
|
||||||
if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); }
|
}
|
||||||
if (vecp ) { checkCudaErrors(cusparseDestroyDnVec(vecp)); }
|
if (vecx) {
|
||||||
|
checkCudaErrors(cusparseDestroyDnVec(vecx));
|
||||||
|
}
|
||||||
|
if (vecAx) {
|
||||||
|
checkCudaErrors(cusparseDestroyDnVec(vecAx));
|
||||||
|
}
|
||||||
|
if (vecp) {
|
||||||
|
checkCudaErrors(cusparseDestroyDnVec(vecp));
|
||||||
|
}
|
||||||
|
|
||||||
free(I);
|
checkCudaErrors(cudaFreeHost(I));
|
||||||
free(J);
|
checkCudaErrors(cudaFreeHost(J));
|
||||||
free(val);
|
checkCudaErrors(cudaFreeHost(val));
|
||||||
free(x);
|
checkCudaErrors(cudaFreeHost(x));
|
||||||
free(rhs);
|
free(rhs);
|
||||||
cudaFree(d_col);
|
checkCudaErrors(cudaFree(d_col));
|
||||||
cudaFree(d_row);
|
checkCudaErrors(cudaFree(d_row));
|
||||||
cudaFree(d_val);
|
checkCudaErrors(cudaFree(d_val));
|
||||||
cudaFree(d_x);
|
checkCudaErrors(cudaFree(d_x));
|
||||||
cudaFree(d_r);
|
checkCudaErrors(cudaFree(d_r));
|
||||||
cudaFree(d_p);
|
checkCudaErrors(cudaFree(d_p));
|
||||||
cudaFree(d_Ax);
|
checkCudaErrors(cudaFree(d_Ax));
|
||||||
|
|
||||||
printf("Test Summary: Error amount = %f\n", err);
|
printf("Test Summary: Error amount = %f\n", err);
|
||||||
exit((k <= max_iter) ? 0 : 1);
|
exit((k <= max_iter) ? 0 : 1);
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -108,6 +108,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -108,6 +108,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -223,7 +223,9 @@ __device__ void gpuDotProduct(float *vecA, float *vecB, int size,
|
||||||
cg::sync(cta);
|
cg::sync(cta);
|
||||||
|
|
||||||
if (tile32.meta_group_rank() == 0) {
|
if (tile32.meta_group_rank() == 0) {
|
||||||
temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
|
temp_sum = tile32.thread_rank() < tile32.meta_group_size()
|
||||||
|
? tmp[tile32.thread_rank()]
|
||||||
|
: 0.0;
|
||||||
temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>());
|
temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>());
|
||||||
|
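The `gpuDotProduct` hunk above is the standard two-stage reduction: each 32-thread tile reduces its partial sum with `cg::reduce`, tile leaders park the result in shared memory, and the first tile combines the per-warp values. A block-local sketch of that pattern (requires `<cooperative_groups.h>` and `<cooperative_groups/reduce.h>` with `namespace cg = cooperative_groups`; `warpSums` must hold one double per warp in the block; names are illustrative and the sample's grid-wide version strides over the whole grid instead of one block):

```cuda
__device__ double blockDot(const float *a, const float *b, int n,
                           double *warpSums /* shared, >= blockDim.x / 32 */) {
  cg::thread_block cta = cg::this_thread_block();
  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  double sum = 0.0;
  for (int i = cta.thread_rank(); i < n; i += cta.size())
    sum += (double)a[i] * b[i];

  sum = cg::reduce(tile32, sum, cg::plus<double>());          // per-warp partial
  if (tile32.thread_rank() == 0) warpSums[tile32.meta_group_rank()] = sum;
  cg::sync(cta);

  if (tile32.meta_group_rank() == 0) {                        // first warp combines
    sum = tile32.thread_rank() < tile32.meta_group_size()
              ? warpSums[tile32.thread_rank()]
              : 0.0;
    sum = cg::reduce(tile32, sum, cg::plus<double>());
  }
  return sum;  // meaningful in thread 0 of the block
}
```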
|
||||||
if (tile32.thread_rank() == 0) {
|
if (tile32.thread_rank() == 0) {
|
||||||
|
@ -239,7 +241,8 @@ __device__ void gpuCopyVector(float *srcA, float *destB, int size,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, int size,
|
__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale,
|
||||||
|
int size,
|
||||||
const cg::multi_grid_group &multi_grid) {
|
const cg::multi_grid_group &multi_grid) {
|
||||||
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
|
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
|
||||||
y[i] = a * x[i] + scale * y[i];
|
y[i] = a * x[i] + scale * y[i];
|
||||||
|
@ -360,7 +363,8 @@ std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
|
||||||
// Filter unsupported devices
|
// Filter unsupported devices
|
||||||
if (deviceProp.cooperativeMultiDeviceLaunch &&
|
if (deviceProp.cooperativeMultiDeviceLaunch &&
|
||||||
deviceProp.concurrentManagedAccess) {
|
deviceProp.concurrentManagedAccess) {
|
||||||
identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), i);
|
identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor),
|
||||||
|
i);
|
||||||
}
|
}
|
||||||
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i,
|
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i,
|
||||||
deviceProp.name, deviceProp.major, deviceProp.minor);
|
deviceProp.name, deviceProp.major, deviceProp.minor);
|
||||||
|
@ -387,15 +391,17 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
auto bestFit = std::make_pair(it, it);
|
auto bestFit = std::make_pair(it, it);
|
||||||
// use std::distance to find the largest number of GPUs amongst architectures
|
// use std::distance to find the largest number of GPUs amongst architectures
|
||||||
auto distance = [](decltype(bestFit) p){return std::distance(p.first, p.second);};
|
auto distance = [](decltype(bestFit) p) {
|
||||||
|
return std::distance(p.first, p.second);
|
||||||
|
};
|
||||||
|
|
||||||
// Read each unique key/pair element in order
|
// Read each unique key/pair element in order
|
||||||
for (; it != end; it = gpusByArch.upper_bound(it->first)) {
|
for (; it != end; it = gpusByArch.upper_bound(it->first)) {
|
||||||
// first and second are iterators bounded within the architecture group
|
// first and second are iterators bounded within the architecture group
|
||||||
auto testFit = gpusByArch.equal_range(it->first);
|
auto testFit = gpusByArch.equal_range(it->first);
|
||||||
// Always use devices with highest architecture version or whichever has the most devices available
|
// Always use devices with highest architecture version or whichever has the
|
||||||
if (distance(bestFit) <= distance(testFit))
|
// most devices available
|
||||||
bestFit = testFit;
|
if (distance(bestFit) <= distance(testFit)) bestFit = testFit;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (distance(bestFit) < kNumGpusRequired) {
|
if (distance(bestFit) < kNumGpusRequired) {
|
||||||
|
@ -408,33 +414,35 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
std::set<int> bestFitDeviceIds;
|
std::set<int> bestFitDeviceIds;
|
||||||
|
|
||||||
// check & select peer-to-peer access capable GPU devices as enabling p2p access between participating
|
// check & select peer-to-peer access capable GPU devices as enabling p2p
|
||||||
|
// access between participating
|
||||||
// GPUs gives better performance for multi_grid sync.
|
// GPUs gives better performance for multi_grid sync.
|
||||||
for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
|
for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
|
||||||
int deviceId = itr->second;
|
int deviceId = itr->second;
|
||||||
checkCudaErrors(cudaSetDevice(deviceId));
|
checkCudaErrors(cudaSetDevice(deviceId));
|
||||||
|
|
||||||
std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) {
|
std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds,
|
||||||
if (deviceId != mapPair.second)
|
&kNumGpusRequired](
|
||||||
{
|
decltype(*itr) mapPair) {
|
||||||
|
if (deviceId != mapPair.second) {
|
||||||
int access = 0;
|
int access = 0;
|
||||||
checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
|
checkCudaErrors(
|
||||||
printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second);
|
cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
|
||||||
|
printf("Device=%d %s Access Peer Device=%d\n", deviceId,
|
||||||
|
access ? "CAN" : "CANNOT", mapPair.second);
|
||||||
if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
|
if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
|
||||||
bestFitDeviceIds.emplace(deviceId);
|
bestFitDeviceIds.emplace(deviceId);
|
||||||
bestFitDeviceIds.emplace(mapPair.second);
|
bestFitDeviceIds.emplace(mapPair.second);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
|
printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
if (bestFitDeviceIds.size() >= kNumGpusRequired)
|
if (bestFitDeviceIds.size() >= kNumGpusRequired) {
|
||||||
{
|
|
||||||
printf("Selected p2p capable devices - ");
|
printf("Selected p2p capable devices - ");
|
||||||
for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++)
|
for (auto devicesItr = bestFitDeviceIds.begin();
|
||||||
{
|
devicesItr != bestFitDeviceIds.end(); devicesItr++) {
|
||||||
printf("deviceId = %d ", *devicesItr);
|
printf("deviceId = %d ", *devicesItr);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -442,32 +450,33 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p capable,
|
// if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p
|
||||||
|
// capable,
|
||||||
// hence we add it without p2p capability check.
|
// hence we add it without p2p capability check.
|
||||||
if (!bestFitDeviceIds.size())
|
if (!bestFitDeviceIds.size()) {
|
||||||
{
|
printf("Devices involved are not p2p capable.. selecting %zu of them\n",
|
||||||
printf("Devices involved are not p2p capable.. selecting %zu of them\n", kNumGpusRequired);
|
kNumGpusRequired);
|
||||||
std::for_each(bestFit.first, bestFit.second, [&bestFitDeviceIds](decltype(*bestFit.first) mapPair) {
|
std::for_each(bestFit.first, bestFit.second,
|
||||||
|
[&bestFitDeviceIds,
|
||||||
|
&kNumGpusRequired](decltype(*bestFit.first) mapPair) {
|
||||||
if (bestFitDeviceIds.size() < kNumGpusRequired) {
|
if (bestFitDeviceIds.size() < kNumGpusRequired) {
|
||||||
bestFitDeviceIds.emplace(mapPair.second);
|
bestFitDeviceIds.emplace(mapPair.second);
|
||||||
}
|
} else {
|
||||||
else {
|
printf("Ignoring device %i (max devices exceeded)\n",
|
||||||
printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
|
mapPair.second);
|
||||||
}
|
}
|
||||||
// Insert the sequence into the deviceIds set
|
// Insert the sequence into the deviceIds set
|
||||||
});
|
});
|
||||||
}
|
} else {
|
||||||
else
|
// perform cudaDeviceEnablePeerAccess in both directions for all
|
||||||
{
|
// participating devices of a cudaLaunchCooperativeKernelMultiDevice call
|
||||||
// perform cudaDeviceEnablePeerAccess in both directions for all participating devices
|
// this gives better performance for multi_grid sync.
|
||||||
// of a cudaLaunchCooperativeKernelMultiDevice call this gives better performance for multi_grid sync.
|
for (auto p1_itr = bestFitDeviceIds.begin();
|
||||||
for (auto p1_itr = bestFitDeviceIds.begin(); p1_itr != bestFitDeviceIds.end(); p1_itr++)
|
p1_itr != bestFitDeviceIds.end(); p1_itr++) {
|
||||||
{
|
|
||||||
checkCudaErrors(cudaSetDevice(*p1_itr));
|
checkCudaErrors(cudaSetDevice(*p1_itr));
|
||||||
for (auto p2_itr = bestFitDeviceIds.begin(); p2_itr != bestFitDeviceIds.end(); p2_itr++)
|
for (auto p2_itr = bestFitDeviceIds.begin();
|
||||||
{
|
p2_itr != bestFitDeviceIds.end(); p2_itr++) {
|
||||||
if (*p1_itr != *p2_itr)
|
if (*p1_itr != *p2_itr) {
|
||||||
{
|
|
||||||
checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0));
|
checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0));
|
||||||
checkCudaErrors(cudaSetDevice(*p1_itr));
|
checkCudaErrors(cudaSetDevice(*p1_itr));
|
||||||
}
|
}
|
||||||
|
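The refactored device-selection code above first checks whether peer access is possible and only then enables it in both directions, since P2P access speeds up the multi-grid synchronization. Stripped of the iterator plumbing, the core calls are (device IDs illustrative):

```cpp
int canAccess = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess, devA, devB));

if (canAccess) {
  checkCudaErrors(cudaSetDevice(devA));
  checkCudaErrors(cudaDeviceEnablePeerAccess(devB, 0));  // flags must be 0
  checkCudaErrors(cudaSetDevice(devB));
  checkCudaErrors(cudaDeviceEnablePeerAccess(devA, 0));  // enable both directions
}
```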
@ -532,14 +541,13 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
int numBlocksPerSm_current = 0;
|
int numBlocksPerSm_current = 0;
|
||||||
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||||
&numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, sMemSize));
|
&numBlocksPerSm_current, multiGpuConjugateGradient, numThreads,
|
||||||
|
sMemSize));
|
||||||
|
|
||||||
if (numBlocksPerSm > numBlocksPerSm_current)
|
if (numBlocksPerSm > numBlocksPerSm_current) {
|
||||||
{
|
|
||||||
numBlocksPerSm = numBlocksPerSm_current;
|
numBlocksPerSm = numBlocksPerSm_current;
|
||||||
}
|
}
|
||||||
if (numSms > deviceProp.multiProcessorCount)
|
if (numSms > deviceProp.multiProcessorCount) {
|
||||||
{
|
|
||||||
numSms = deviceProp.multiProcessorCount;
|
numSms = deviceProp.multiProcessorCount;
|
||||||
}
|
}
|
||||||
deviceId++;
|
deviceId++;
|
||||||
|
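The loop above takes the minimum `numBlocksPerSm` and SM count across the selected GPUs so that the cooperative grid fits co-residently on every one of them, which grid-wide synchronization requires. Reduced to a single-device sketch (variable names illustrative; `numThreads` and `sMemSize` assumed from the sample):

```cpp
int numBlocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    &numBlocksPerSm, multiGpuConjugateGradient, numThreads, sMemSize));

cudaDeviceProp prop;
checkCudaErrors(cudaGetDeviceProperties(&prop, dev));

// Just enough blocks to keep every SM at its occupancy limit, and no more,
// so all blocks of the grid can be resident at once.
dim3 dimGrid(prop.multiProcessorCount * numBlocksPerSm, 1, 1);
dim3 dimBlock(THREADS_PER_BLOCK, 1, 1);
```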
@ -554,7 +562,7 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
int device_count = 0;
|
int device_count = 0;
|
||||||
int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK;
|
int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK;
|
||||||
deviceId = bestFitDeviceIds.begin();;
|
deviceId = bestFitDeviceIds.begin();
|
||||||
while (deviceId != bestFitDeviceIds.end()) {
|
while (deviceId != bestFitDeviceIds.end()) {
|
||||||
checkCudaErrors(cudaSetDevice(*deviceId));
|
checkCudaErrors(cudaSetDevice(*deviceId));
|
||||||
checkCudaErrors(cudaStreamCreate(&nStreams[device_count]));
|
checkCudaErrors(cudaStreamCreate(&nStreams[device_count]));
|
||||||
|
@ -621,14 +629,15 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
printf("Total threads per GPU = %d numBlocksPerSm = %d\n",
|
printf("Total threads per GPU = %d numBlocksPerSm = %d\n",
|
||||||
numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm);
|
numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm);
|
||||||
dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1);
|
dim3 dimGrid(numSms * numBlocksPerSm, 1, 1),
|
||||||
|
dimBlock(THREADS_PER_BLOCK, 1, 1);
|
||||||
void *kernelArgs[] = {
|
void *kernelArgs[] = {
|
||||||
(void *)&I, (void *)&J, (void *)&val, (void *)&x,
|
(void *)&I, (void *)&J, (void *)&val, (void *)&x,
|
||||||
(void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result,
|
(void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result,
|
||||||
(void *)&nz, (void *)&N, (void *)&tol,
|
(void *)&nz, (void *)&N, (void *)&tol,
|
||||||
};
|
};
|
||||||
cudaLaunchParams *launchParamsList = (cudaLaunchParams *)malloc(
|
cudaLaunchParams *launchParamsList =
|
||||||
sizeof(cudaLaunchParams) * kNumGpusRequired);
|
(cudaLaunchParams *)malloc(sizeof(cudaLaunchParams) * kNumGpusRequired);
|
||||||
for (int i = 0; i < kNumGpusRequired; i++) {
|
for (int i = 0; i < kNumGpusRequired; i++) {
|
||||||
launchParamsList[i].func = (void *)multiGpuConjugateGradient;
|
launchParamsList[i].func = (void *)multiGpuConjugateGradient;
|
||||||
launchParamsList[i].gridDim = dimGrid;
|
launchParamsList[i].gridDim = dimGrid;
|
||||||
|
@ -645,12 +654,11 @@ int main(int argc, char **argv) {
|
||||||
cudaCooperativeLaunchMultiDeviceNoPreSync |
|
cudaCooperativeLaunchMultiDeviceNoPreSync |
|
||||||
cudaCooperativeLaunchMultiDeviceNoPostSync));
|
cudaCooperativeLaunchMultiDeviceNoPostSync));
|
||||||
|
|
||||||
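The flags shown just above belong to the multi-device cooperative launch, which takes one `cudaLaunchParams` entry per participating GPU. A sketch of how the sample's entries are assembled (`blockDim`, `args`, `sharedMem`, and `stream` fields are not visible in this diff and are assumed to follow the standard `cudaLaunchParams` layout and the sample's variables):

```cpp
for (int i = 0; i < kNumGpusRequired; i++) {
  launchParamsList[i].func      = (void *)multiGpuConjugateGradient;
  launchParamsList[i].gridDim   = dimGrid;        // sized from occupancy above
  launchParamsList[i].blockDim  = dimBlock;
  launchParamsList[i].sharedMem = sMemSize;
  launchParamsList[i].stream    = nStreams[i];    // one stream per device
  launchParamsList[i].args      = kernelArgs;     // same argument list everywhere
}

checkCudaErrors(cudaLaunchCooperativeKernelMultiDevice(
    launchParamsList, kNumGpusRequired,
    cudaCooperativeLaunchMultiDeviceNoPreSync |
        cudaCooperativeLaunchMultiDeviceNoPostSync));
```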
checkCudaErrors(
|
checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId));
|
||||||
cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId));
|
|
||||||
checkCudaErrors(
|
checkCudaErrors(
|
||||||
cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
|
cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
|
||||||
|
|
||||||
deviceId = bestFitDeviceIds.begin();;
|
deviceId = bestFitDeviceIds.begin();
|
||||||
device_count = 0;
|
device_count = 0;
|
||||||
while (deviceId != bestFitDeviceIds.end()) {
|
while (deviceId != bestFitDeviceIds.end()) {
|
||||||
checkCudaErrors(cudaSetDevice(*deviceId));
|
checkCudaErrors(cudaSetDevice(*deviceId));
|
||||||
|
@ -658,7 +666,7 @@ int main(int argc, char **argv) {
|
||||||
deviceId++;
|
deviceId++;
|
||||||
}
|
}
|
||||||
|
|
||||||
r1 = *dot_result;
|
r1 = (float)*dot_result;
|
||||||
|
|
||||||
printf("GPU Final, residual = %e \n ", sqrt(r1));
|
printf("GPU Final, residual = %e \n ", sqrt(r1));
|
||||||
|
|
||||||
|
|
|
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -109,6 +109,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -105,6 +105,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -271,6 +271,12 @@ ifeq ($(TARGET_ARCH),armv7l)
   SAMPLE_ENABLED := 0
 endif

+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - cuSolverDn_LinearSolver is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
 ifeq ($(TARGET_OS),linux)
     ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\"
 endif
@@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -110,6 +110,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -106,6 +106,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

 SAMPLE_ENABLED := 1

+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - cuSolverSp_LinearSolver is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
 ifeq ($(TARGET_OS),linux)
     ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\"
 endif
@@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -495,13 +495,13 @@ int main(int argc, char *argv[]) {
   size_t bufferSize = 0;
   checkCudaErrors(cusparseSpMV_bufferSize(
       cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
-      &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize));
+      &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
   void *buffer = NULL;
   checkCudaErrors(cudaMalloc(&buffer, bufferSize));

   checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F,
-                               CUSPARSE_MV_ALG_DEFAULT, &buffer));
+                               CUSPARSE_SPMV_ALG_DEFAULT, buffer));

   checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA,
                                   cudaMemcpyDeviceToHost, stream));
@@ -559,7 +559,7 @@ int main(int argc, char *argv[]) {

   checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F,
-                               CUSPARSE_MV_ALG_DEFAULT, &buffer));
+                               CUSPARSE_SPMV_ALG_DEFAULT, buffer));

   checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA,
                                   cudaMemcpyDeviceToHost, stream));
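Note: this hunk tracks the cuSPARSE generic-API rename from `CUSPARSE_MV_ALG_DEFAULT` to `CUSPARSE_SPMV_ALG_DEFAULT`, and the workspace is now passed by value rather than by address. A hedged sketch of the full call sequence, assuming `cusparseHandle` is a valid handle and `matA`, `vecx`, `vecAx` were already created with `cusparseCreateCsr` / `cusparseCreateDnVec`:

```cuda
// y = alpha * A * x + beta * y with the cuSPARSE generic SpMV API.
double alpha = -1.0, beta = 1.0;
size_t bufferSize = 0;
void *dBuffer = NULL;

checkCudaErrors(cusparseSpMV_bufferSize(
    cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
    &beta, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
checkCudaErrors(cudaMalloc(&dBuffer, bufferSize));

// The workspace pointer itself is passed (void *), not its address.
checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             &alpha, matA, vecx, &beta, vecAx, CUDA_R_64F,
                             CUSPARSE_SPMV_ALG_DEFAULT, dBuffer));
```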
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -110,6 +110,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -106,6 +106,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -30,7 +30,7 @@ cudaMalloc, cudaFree

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -109,6 +109,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -105,6 +105,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -279,6 +279,12 @@ ifeq ($(TARGET_ARCH),armv7l)
   SAMPLE_ENABLED := 0
 endif

+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - cudaNvSci is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
 ALL_LDFLAGS :=
 ALL_LDFLAGS += $(ALL_CCFLAGS)
 ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -109,6 +109,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -105,6 +105,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run
@@ -125,7 +125,7 @@ int main(int argc, char **argv) {
 #endif
     printf("%s", msg);

-    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
+    printf("  (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
            deviceProp.multiProcessorCount,
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
@@ -250,8 +250,7 @@ int main(int argc, char **argv) {
         "device)",
         "Exclusive Process (many threads in one process is able to use "
         "::cudaSetDevice() with this device)",
-        "Unknown",
-        NULL};
+        "Unknown", NULL};
     printf("  Compute Mode:\n");
     printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
   }
@@ -307,7 +306,8 @@ int main(int argc, char **argv) {
   // driver version
   sProfileString += ", CUDA Driver Version = ";
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-  sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
+  sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000,
+            (driverVersion % 100) / 10);
 #else
   snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
            (driverVersion % 100) / 10);
@@ -317,7 +317,8 @@ int main(int argc, char **argv) {
   // Runtime version
   sProfileString += ", CUDA Runtime Version = ";
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-  sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
+  sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000,
+            (runtimeVersion % 100) / 10);
 #else
   snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
            (runtimeVersion % 100) / 10);
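Note: the reflowed `sprintf_s`/`snprintf` calls above convert the packed integer version (1000 * major + 10 * minor, for example 11030) into the familiar `major.minor` string. A small standalone illustration of that arithmetic:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int driverVersion = 0, runtimeVersion = 0;
  cudaDriverGetVersion(&driverVersion);
  cudaRuntimeGetVersion(&runtimeVersion);

  // 11030 -> "11.3": thousands carry the major version, the tens digit the minor.
  char cTemp[16];
  std::snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
                (driverVersion % 100) / 10);
  std::printf("CUDA Driver Version  = %s\n", cTemp);
  std::snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
                (runtimeVersion % 100) / 10);
  std::printf("CUDA Runtime Version = %s\n", cTemp);
  return 0;
}
```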
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run
@@ -38,7 +38,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -34,7 +34,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets">
     <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
   </ImportGroup>
 </Project>
@@ -30,7 +30,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run
@@ -28,12 +28,14 @@
 /**
  * Matrix multiplication: C = A * B.
  *
  * This sample demonstrates implements matrix multiplication which makes use of
  * shared memory to ensure data reuse, the matrix multiplication is done using
  * tiling approach.
  * With compute capability 8.0 or higher the CUDA kernels involved uses
  * asynchronously copy data from global to shared memory; a.k.a., async-copy.
  * This sample has been written for clarity of exposition to illustrate various
  * CUDA programming principles, not with the goal of providing the most
  * performant generic kernel for matrix multiplication.
  */

 // System includes
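Note: as the header comment above says, each kernel in this sample computes one BLOCK_SIZE x BLOCK_SIZE tile of C per thread block. A hedged sketch of the host-side launch geometry such kernels expect; `d_A`, `d_B`, `d_C`, `dimsA` and `dimsB` are placeholders for the device buffers and matrix extents set up elsewhere:

```cuda
// One thread per element of C, one block per BLOCK_SIZE x BLOCK_SIZE tile.
// Assumes the matrix extents are multiples of the tile size.
constexpr int BLOCK = 16;  // matches blockSize in this sample
dim3 threads(BLOCK, BLOCK);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

MatrixMulNaive<BLOCK><<<grid, threads>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
checkCudaErrors(cudaDeviceSynchronize());
```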
@@ -55,8 +57,7 @@ namespace cg = cooperative_groups;
 #include <helper_functions.h>
 #include <helper_cuda.h>

 enum kernels {
   AsyncCopyMultiStageLargeChunk = 0,
   AsyncCopyLargeChunk = 1,
   AsyncCopyLargeChunkAWBarrier = 2,
@@ -67,17 +68,22 @@ enum kernels
   NaiveLargeChunk = 7
 };

 const char *kernelNames[] = {"AsyncCopyMultiStageLargeChunk",
                              "AsyncCopyLargeChunk",
                              "AsyncCopyLargeChunkAWBarrier",
                              "AsyncCopyMultiStageSharedState",
                              "AsyncCopyMultiStage",
                              "AsyncCopySingleStage",
                              "Naive",
                              "NaiveLargeChunk"};

 constexpr int blockSize = 16;

 // Multi Stage memcpy_async pipeline with large chunk copy
 template <int BLOCK_SIZE>
 __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
     float *__restrict__ C, const float *__restrict__ A,
     const float *__restrict__ B, int wA, int wB) {
   // Requires BLOCK_SIZE % 4 == 0

   // Multi-stage pipeline version
@@ -85,11 +91,13 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyMultiStageLargeChunk
   // Declaration of the shared memory array As used to
   // store the sub-matrix of A for each stage
   __shared__ alignas(
       alignof(float4)) float As[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];

   // Declaration of the shared memory array Bs used to
   // store the sub-matrix of B for each stage
   __shared__ alignas(
       alignof(float4)) float Bs[maxPipelineStages][BLOCK_SIZE][BLOCK_SIZE];

   float Csub = 0.0;

@@ -115,18 +123,21 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyMultiStageLargeChunk
   // Loop over all the sub-matrices of A and B
   // required to compute the block sub-matrix
   for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin,
           iStage = 0;
        a <= aEnd; a += aStep, b += bStep, ++i) {
     // Load the matrices from device memory to shared memory; each thread loads
     // one element of each matrix
     for (; aStage <= a + aStep * maxPipelineStages;
          aStage += aStep, bStage += bStep, ++iStage) {
       pipe.producer_acquire();
       if (aStage <= aEnd && t4x < BLOCK_SIZE) {
         // Rotating buffer
         const int j = iStage % maxPipelineStages;
         cuda::memcpy_async(&As[j][threadIdx.y][t4x],
                            &A[aStage + wA * threadIdx.y + t4x], shape4, pipe);
         cuda::memcpy_async(&Bs[j][threadIdx.y][t4x],
                            &B[aStage + wA * threadIdx.y + t4x], shape4, pipe);
       }
       pipe.producer_commit();
     }
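Note: the loop above treats shared memory as a ring of `maxPipelineStages` tile buffers, so stage `iStage` always lands in slot `iStage % maxPipelineStages` while earlier stages are still being consumed. A self-contained sketch of that rotating-buffer idea in isolation, not taken from the sample; launch with `TILE` threads per block and `src` sized to `numTiles * TILE` floats:

```cuda
#include <cuda/pipeline>

// Rotating-buffer idea in isolation: keep up to STAGES async copies in flight
// into a ring of shared-memory slots; the consumer trails the producer.
template <int TILE, int STAGES>
__global__ void ringBufferSum(const float *__restrict__ src, float *out,
                              int numTiles) {
  __shared__ float ring[STAGES][TILE];
  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();

  float acc = 0.0f;
  int produced = 0;
  for (int consumed = 0; consumed < numTiles; ++consumed) {
    // Produce: fill the ring ahead of the consumer, one stage per commit.
    for (; produced < numTiles && produced < consumed + STAGES; ++produced) {
      pipe.producer_acquire();
      cuda::memcpy_async(&ring[produced % STAGES][threadIdx.x],
                         &src[produced * TILE + threadIdx.x], sizeof(float),
                         pipe);
      pipe.producer_commit();
    }
    // Consume the oldest committed stage once its copy has completed.
    pipe.consumer_wait();
    acc += ring[consumed % STAGES][threadIdx.x];  // stand-in for real work
    pipe.consumer_release();
  }
  out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}
```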
@@ -157,12 +168,12 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyMultiStageLargeChunk
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }

 // Single Stage memcpy_async pipeline with Large copy chunk (float4)
 template <int BLOCK_SIZE>
 __global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C,
                                              const float *__restrict__ A,
                                              const float *__restrict__ B,
                                              int wA, int wB) {
   // Requires BLOCK_SIZE % 4 == 0

   // Declaration of the shared memory array As used to
@@ -207,11 +218,12 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyLargeChunk(float* __
   // Now, one fourth of the threads load four elements of each matrix
   if (t4x < BLOCK_SIZE) {
     pipe.producer_acquire();

     cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x],
                        shape4, pipe);
     cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x],
                        shape4, pipe);

     pipe.producer_commit();
     pipe.consumer_wait();
@@ -242,11 +254,12 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyLargeChunk(float* __
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }

 // Single Stage memcpy_async pipeline with Large copy chunk (float4) using
 // arrive-wait barrier
 template <int BLOCK_SIZE>
 __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(
     float *__restrict__ C, const float *__restrict__ A,
     const float *__restrict__ B, int wA, int wB) {
 #if __CUDA_ARCH__ >= 700
 #pragma diag_suppress static_var_with_dynamic_init
   // Requires BLOCK_SIZE % 4 == 0
@@ -295,8 +308,10 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(
   if (t4x < BLOCK_SIZE) {
     float4 *const A4s = reinterpret_cast<float4 *>(&As[threadIdx.y][t4x]);
     float4 *const B4s = reinterpret_cast<float4 *>(&Bs[threadIdx.y][t4x]);
     const float4 *const A4 =
         reinterpret_cast<const float4 *>(&A[a + wA * threadIdx.y + t4x]);
     const float4 *const B4 =
         reinterpret_cast<const float4 *>(&B[a + wA * threadIdx.y + t4x]);

     cuda::memcpy_async(A4s, A4, sizeof(float4), bar);
     cuda::memcpy_async(B4s, B4, sizeof(float4), bar);
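Note: the arrive-wait-barrier variant above associates `cuda::memcpy_async` with a block-scoped `cuda::barrier` instead of a pipeline; the copies are guaranteed complete once the barrier wait returns. A minimal sketch of that pattern, not taken from the sample; launch with `TILE` threads per block:

```cuda
#include <cooperative_groups.h>
#include <cuda/barrier>

namespace cg = cooperative_groups;

// Stage one tile into shared memory with memcpy_async bound to an arrive-wait
// barrier, then use it once every thread has waited on the barrier.
template <int TILE>
__global__ void barrierStagedCopy(const float *__restrict__ src, float *out) {
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
  __shared__ float tile[TILE];
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;

  auto block = cg::this_thread_block();
  if (block.thread_rank() == 0) {
    init(&bar, block.size());  // one arrival expected per thread in the block
  }
  block.sync();

  // The copy completes no later than the barrier wait below.
  cuda::memcpy_async(&tile[threadIdx.x],
                     &src[blockIdx.x * TILE + threadIdx.x], sizeof(float), bar);
  bar.arrive_and_wait();

  out[blockIdx.x * TILE + threadIdx.x] = tile[threadIdx.x] * 2.0f;  // stand-in
#endif
}
```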
@@ -327,10 +342,9 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(
 }

 // Single Stage memcpy_async pipeline with float copy
 template <int BLOCK_SIZE>
 __global__ void MatrixMulAsyncCopySingleStage(float *C, const float *A,
                                               const float *B, int wA, int wB) {
   // Declaration of the shared memory array As used to
   // store the sub-matrix of A
   __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
@@ -360,7 +374,6 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopySingleStage(float *C
   cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();
   const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));

   // Loop over all the sub-matrices of A and B
   // required to compute the block sub-matrix
   for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
@@ -369,8 +382,10 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopySingleStage(float *C
     {
       pipe.producer_acquire();

       cuda::memcpy_async(&As[threadIdx.y][threadIdx.x],
                          &A[a + wA * threadIdx.y + threadIdx.x], shape1, pipe);
       cuda::memcpy_async(&Bs[threadIdx.y][threadIdx.x],
                          &B[b + wB * threadIdx.y + threadIdx.x], shape1, pipe);

       pipe.producer_commit();
     }
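Note: in the single-stage form above the pipeline acts as a simple copy fence: acquire, issue the copies, commit, then `consumer_wait` before the tile is read and `consumer_release` once it can be overwritten. The same sequence in a stripped-down kernel, under the same assumptions as the previous sketch:

```cuda
#include <cuda/pipeline>

// Single-stage version: one tile in flight at a time, the pipeline acting as a
// per-thread copy fence between the async copy and the use of the data.
template <int TILE>
__global__ void singleStageSum(const float *__restrict__ src, float *out,
                               int numTiles) {
  __shared__ float staging[TILE];
  cuda::pipeline<cuda::thread_scope_thread> pipe = cuda::make_pipeline();

  float acc = 0.0f;
  for (int t = 0; t < numTiles; ++t) {
    pipe.producer_acquire();
    cuda::memcpy_async(&staging[threadIdx.x], &src[t * TILE + threadIdx.x],
                       sizeof(float), pipe);
    pipe.producer_commit();

    pipe.consumer_wait();   // this thread's copy is now visible
    __syncthreads();        // whole tile visible to the block
    acc += staging[threadIdx.x];
    pipe.consumer_release();
    __syncthreads();        // safe to overwrite the staging tile next iteration
  }
  out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}
```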
@@ -399,11 +414,13 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopySingleStage(float *C
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }

 // Multi Stage memcpy_async thread_scope_thread pipeline with single-element
 // async-copy
 template <int BLOCK_SIZE>
 __global__ void MatrixMulAsyncCopyMultiStage(float *__restrict__ C,
                                              const float *__restrict__ A,
                                              const float *__restrict__ B,
                                              int wA, int wB) {
   // Multi-stage pipeline version
   constexpr size_t maxPipelineStages = 4;

@@ -437,21 +454,26 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyMultiStage(float* __
   // Loop over all the sub-matrices of A and B
   // required to compute the block sub-matrix
   for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin,
           iStage = 0;
        a <= aEnd; a += aStep, b += bStep, ++i) {
     // Load the matrices from device memory to shared memory; each thread loads
     // one element of each matrix

     for (; aStage <= a + aStep * maxPipelineStages;
          aStage += aStep, bStage += bStep, ++iStage) {
       if (aStage <= aEnd) {
         // Rotating buffer
         const int j = iStage % maxPipelineStages;

         pipe.producer_acquire();

         cuda::memcpy_async(&As[j][threadIdx.y][threadIdx.x],
                            &A[aStage + wA * threadIdx.y + threadIdx.x], shape1,
                            pipe);
         cuda::memcpy_async(&Bs[j][threadIdx.y][threadIdx.x],
                            &B[bStage + wB * threadIdx.y + threadIdx.x], shape1,
                            pipe);

         pipe.producer_commit();
       }
@@ -484,11 +506,12 @@ template <int BLOCK_SIZE> __global__ void MatrixMulAsyncCopyMultiStage(float* __
 // Multi Stage shared state memcpy_async pipeline thread_scope_block
 // with parititioned producer & consumer, here we've 1 warp as producer
 // group which issues memcpy_async operations and rest all warps are part of
 // consumer group which perform gemm computation on the loaded matrices by
 // producer.
 template <int BLOCK_SIZE_X>
 __global__ void MatrixMulAsyncCopyMultiStageSharedState(
     float *__restrict__ C, const float *__restrict__ A,
     const float *__restrict__ B, int wA, int wB) {
   // Multi-stage pipeline version
   constexpr size_t maxPipelineStages = 4;

@@ -520,7 +543,8 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStageSharedSt
   auto cta = cg::this_thread_block();

   const auto shape1 = cuda::aligned_size_t<alignof(float)>(sizeof(float));
   __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block,
                                          maxPipelineStages> shared_state;
   constexpr int consumer_row_count = BLOCK_SIZE_X;

   const auto thread_role = (cta.thread_index().y < consumer_row_count)
|
||||||
|
|
||||||
// Loop over all the sub-matrices of A and B
|
// Loop over all the sub-matrices of A and B
|
||||||
// required to compute the block sub-matrix
|
// required to compute the block sub-matrix
|
||||||
for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin, iStage = 0;
|
for (int a = aBegin, b = bBegin, i = 0, aStage = aBegin, bStage = bBegin,
|
||||||
|
iStage = 0;
|
||||||
a <= aEnd; a += aStep, b += bStep, ++i) {
|
a <= aEnd; a += aStep, b += bStep, ++i) {
|
||||||
if (threadIdx.y >= consumer_row_count) {
|
if (threadIdx.y >= consumer_row_count) {
|
||||||
// this is a whole producer warp because threadIdx.y >= 16 where 16 == consumer_row_count,
|
// this is a whole producer warp because threadIdx.y >= 16 where 16 ==
|
||||||
|
// consumer_row_count,
|
||||||
// which loads the matrices from device memory to shared memory;
|
// which loads the matrices from device memory to shared memory;
|
||||||
for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) {
|
for (; aStage <= a + aStep * maxPipelineStages;
|
||||||
|
aStage += aStep, bStage += bStep, ++iStage) {
|
||||||
if (aStage <= aEnd) {
|
if (aStage <= aEnd) {
|
||||||
// Rotating buffer
|
// Rotating buffer
|
||||||
const int j = iStage % maxPipelineStages;
|
const int j = iStage % maxPipelineStages;
|
||||||
const int strideRows = (blockDim.y - consumer_row_count);
|
const int strideRows = (blockDim.y - consumer_row_count);
|
||||||
pipe.producer_acquire();
|
pipe.producer_acquire();
|
||||||
for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) {
|
for (int rowId = threadIdx.y - consumer_row_count;
|
||||||
|
rowId < BLOCK_SIZE_X; rowId += strideRows) {
|
||||||
cuda::memcpy_async(&As[j][rowId][threadIdx.x],
|
cuda::memcpy_async(&As[j][rowId][threadIdx.x],
|
||||||
&A[aStage + wA * rowId + threadIdx.x], shape1, pipe);
|
&A[aStage + wA * rowId + threadIdx.x], shape1,
|
||||||
|
pipe);
|
||||||
cuda::memcpy_async(&Bs[j][rowId][threadIdx.x],
|
cuda::memcpy_async(&Bs[j][rowId][threadIdx.x],
|
||||||
&B[bStage + wB * rowId + threadIdx.x], shape1, pipe);
|
&B[bStage + wB * rowId + threadIdx.x], shape1,
|
||||||
|
pipe);
|
||||||
}
|
}
|
||||||
pipe.producer_commit();
|
pipe.producer_commit();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else {
|
// this is a whole set of consumer group because threadIdx.y <
|
||||||
// this is a whole set of consumer group because threadIdx.y < consumer_row_count where consumer_row_count == 16,
|
// consumer_row_count where consumer_row_count == 16,
|
||||||
// which computes gemm operation on matrices loaded in shared memory by producer warp.
|
// which computes gemm operation on matrices loaded in shared memory by
|
||||||
|
// producer warp.
|
||||||
const int j = i % maxPipelineStages;
|
const int j = i % maxPipelineStages;
|
||||||
// Synchronize consumer group to make sure the matrices are loaded by producer group.
|
// Synchronize consumer group to make sure the matrices are loaded by
|
||||||
|
// producer group.
|
||||||
pipe.consumer_wait();
|
pipe.consumer_wait();
|
||||||
// Multiply the two matrices together;
|
// Multiply the two matrices together;
|
||||||
// each thread computes one element
|
// each thread computes one element
|
||||||
|
@ -570,8 +602,7 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStageSharedSt
|
||||||
|
|
||||||
// Write the block sub-matrix to device memory;
|
// Write the block sub-matrix to device memory;
|
||||||
// each thread writes four element
|
// each thread writes four element
|
||||||
if (threadIdx.y < consumer_row_count)
|
if (threadIdx.y < consumer_row_count) {
|
||||||
{
|
|
||||||
const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x;
|
const int c = wB * BLOCK_SIZE_X * blockIdx.y + BLOCK_SIZE_X * blockIdx.x;
|
||||||
C[c + wB * threadIdx.y + threadIdx.x] = Csub;
|
C[c + wB * threadIdx.y + threadIdx.x] = Csub;
|
||||||
}
|
}
|
||||||
|
@ -581,9 +612,8 @@ template <int BLOCK_SIZE_X> __global__ void MatrixMulAsyncCopyMultiStageSharedSt
|
||||||
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
|
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
|
||||||
* wA is A's width and wB is B's width
|
* wA is A's width and wB is B's width
|
||||||
*/
|
*/
|
||||||
template <int BLOCK_SIZE> __global__ void MatrixMulNaive(float *C, float *A,
|
template <int BLOCK_SIZE>
|
||||||
float *B, int wA,
|
__global__ void MatrixMulNaive(float *C, float *A, float *B, int wA, int wB) {
|
||||||
int wB) {
|
|
||||||
// Declaration of the shared memory array As used to
|
// Declaration of the shared memory array As used to
|
||||||
// store the sub-matrix of A
|
// store the sub-matrix of A
|
||||||
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
||||||
|
@@ -613,10 +643,7 @@ template <int BLOCK_SIZE> __global__ void MatrixMulNaive(float *C, float *A,

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
@@ -646,8 +673,8 @@ template <int BLOCK_SIZE> __global__ void MatrixMulNaive(float *C, float *A,
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

template <int BLOCK_SIZE>
__global__ void MatrixMulNaiveLargeChunk(float *C, float *A, float *B, int wA,
                                         int wB) {
  // Declaration of the shared memory array As used to
  // store the sub-matrix of A
@@ -680,10 +707,7 @@ template <int BLOCK_SIZE> __global__ void MatrixMulNaiveLargeChunk(float *C, flo

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Load the matrices from device memory
    // to shared memory;
@@ -691,8 +715,10 @@ template <int BLOCK_SIZE> __global__ void MatrixMulNaiveLargeChunk(float *C, flo
    if (t4x < BLOCK_SIZE) {
      float4 *const A4s = reinterpret_cast<float4 *>(&As[threadIdx.y][t4x]);
      float4 *const B4s = reinterpret_cast<float4 *>(&Bs[threadIdx.y][t4x]);
      const float4 *const A4 =
          reinterpret_cast<float4 *>(&A[a + wA * threadIdx.y + t4x]);
      const float4 *const B4 =
          reinterpret_cast<float4 *>(&B[a + wA * threadIdx.y + t4x]);
      *A4s = *A4;
      *B4s = *B4;
    }
@@ -720,7 +746,6 @@ template <int BLOCK_SIZE> __global__ void MatrixMulNaiveLargeChunk(float *C, flo
  C[c + wB * threadIdx.y + threadIdx.x] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
@@ -730,9 +755,7 @@ void ConstantInit(float *data, int size, float val) {
/**
 * Run matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, const dim3 &dimsA, const dim3 &dimsB,
                   kernels kernel_number) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
@@ -775,8 +798,10 @@ int MatrixMultiply(int argc, char **argv,
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // copy host memory to device
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaMemsetAsync(d_C, 0, mem_size_C, stream));

  // Setup execution parameters
@@ -786,47 +811,57 @@ int MatrixMultiply(int argc, char **argv,
  // Here the block size is 16x18, where first 16 rows are consumer thread group
  // and last 2 rows (1 warp) is producer thread group
  dim3 threadsSharedStateKernel(blockSize, blockSize + 2, 1);
  dim3 gridSharedStateKernel(dimsB.x / threadsSharedStateKernel.x,
                             dimsA.y / threadsSharedStateKernel.x);

  printf("Running kernel = %d - %s\n", kernel_number,
         kernelNames[kernel_number]);
  // Create and start timer
  printf("Computing result using CUDA Kernel...\n");

  // Performs warmup operation using matrixMul CUDA kernel
  switch (kernel_number) {
    case AsyncCopyMultiStageLargeChunk:
    default:
      MatrixMulAsyncCopyMultiStageLargeChunk<
          blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x,
                                                   dimsB.x);
      break;
    case AsyncCopyLargeChunk:
      MatrixMulAsyncCopyLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopyLargeChunkAWBarrier:
      MatrixMulAsyncCopyLargeChunkAWBarrier<
          blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x,
                                                   dimsB.x);
      break;
    case AsyncCopyMultiStageSharedState:
      MatrixMulAsyncCopyMultiStageSharedState<blockSize><<<
          gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopyMultiStage:
      MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case AsyncCopySingleStage:
      MatrixMulAsyncCopySingleStage<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
    case Naive:
      MatrixMulNaive<blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B,
                                                              dimsA.x, dimsB.x);
      break;
    case NaiveLargeChunk:
      MatrixMulNaiveLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
          d_C, d_A, d_B, dimsA.x, dimsB.x);
      break;
  }

  printf("done\n");
  checkCudaErrors(cudaStreamSynchronize(stream));

  // Execute the kernel
  int nIter = 100;
@@ -834,33 +869,42 @@ int MatrixMultiply(int argc, char **argv,
  checkCudaErrors(cudaEventRecord(start, stream));

  for (int j = 0; j < nIter; j++) {
    switch (kernel_number) {
      case AsyncCopyMultiStageLargeChunk:
      default:
        MatrixMulAsyncCopyMultiStageLargeChunk<
            blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x,
                                                     dimsB.x);
        break;
      case AsyncCopyLargeChunk:
        MatrixMulAsyncCopyLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopyLargeChunkAWBarrier:
        MatrixMulAsyncCopyLargeChunkAWBarrier<
            blockSize><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x,
                                                     dimsB.x);
        break;
      case AsyncCopyMultiStageSharedState:
        MatrixMulAsyncCopyMultiStageSharedState<blockSize><<<
            gridSharedStateKernel, threadsSharedStateKernel, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopyMultiStage:
        MatrixMulAsyncCopyMultiStage<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case AsyncCopySingleStage:
        MatrixMulAsyncCopySingleStage<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case Naive:
        MatrixMulNaive<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
      case NaiveLargeChunk:
        MatrixMulNaiveLargeChunk<blockSize><<<grid, threads, 0, stream>>>(
            d_C, d_A, d_B, dimsA.x, dimsB.x);
        break;
    }
  }
@@ -879,18 +923,16 @@ int MatrixMultiply(int argc, char **argv,
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  printf(
      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

  // Copy result from device to host
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  printf("Checking computed result for correctness: ");
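As a quick sanity check on the timing code above: msecPerMatrixMul is the average over the nIter launches, and the reported figure is GFlop/s = (2 · dimsA.x · dimsA.y · dimsB.x · 1e-9) / (msecPerMatrixMul / 1000). For a hypothetical 1024 x 1024 x 1024 multiply averaging 2 ms per launch, that works out to roughly 1074 GFlop/s (about 2.15e9 flops in 0.002 s); the matrix sizes and timing here are illustrative, not measurements from the sample.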
@@ -907,8 +949,8 @@ int MatrixMultiply(int argc, char **argv,
    double rel_err = abs_err / abs_val / dot_length;

    if (rel_err > eps) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
             h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }
@@ -924,7 +966,8 @@ int MatrixMultiply(int argc, char **argv,
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance "
      "measurements. Results may vary when GPU Boost is enabled.\n");

  if (correct) {
@@ -934,7 +977,6 @@ int MatrixMultiply(int argc, char **argv,
  }
}

int main(int argc, char **argv) {
  printf("[globalToShmemAsyncCopy] - Starting...\n");
@@ -943,11 +985,20 @@ int main(int argc, char **argv) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
    printf(
        " -kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - "
        "AsyncCopyLargeChunk)\n");
    printf(
        " (2 - AsyncCopyLargeChunkAWBarrier; 3 - "
        "AsyncCopyMultiStageSharedState)\n");
    printf(
        " (4 - AsyncCopyMultiStage; 5 - "
        "AsyncCopySingleStage; 6 - Naive without memcpy_async)\n");
    printf(
        " (7 - NaiveLargeChunk without "
        "memcpy_async)\n");
    printf(
        " Note: Outer matrix dimensions of A & B matrices must be equal.\n");

    exit(EXIT_SUCCESS);
  }
@@ -990,31 +1041,31 @@ int main(int argc, char **argv) {

  // kernel to run - default (AsyncCopyMultiStageLargeChunk == 0)
  if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) {
    int kernel_number =
        getCmdLineArgumentInt(argc, (const char **)argv, "kernel");
    if (kernel_number < 8) {
      selected_kernel = (kernels)kernel_number;
    } else {
      printf(
          "Error: kernel number should be between 0 to 6, you have entered "
          "%d\n",
          kernel_number);
      exit(EXIT_FAILURE);
    }
  }

  int major = 0;
  checkCudaErrors(
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
  if (major < 7) {
    printf("globalToShmemAsyncCopy requires SM 7.0 or higher. Exiting...\n");
    exit(EXIT_WAIVED);
  }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
         dimsB.y);

  int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB, selected_kernel);

  exit(matrix_result);
}
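Given the usage text above, a typical invocation (binary name and sizes are illustrative, not taken from the commit) is `./globalToShmemAsyncCopy -wA=1024 -hA=1024 -wB=1024 -hB=1024 -kernel=3`, which selects the AsyncCopyMultiStageSharedState variant driven by the 16+2-row block configuration set up earlier.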
@@ -38,7 +38,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,

## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## Build and Run
@@ -38,7 +38,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,

## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## Build and Run
@@ -38,7 +38,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -109,6 +109,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -105,6 +105,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -100,8 +100,10 @@ int main(int argc, char **argv) {

  double *b = NULL;
  float *A = NULL;
-  b = (double *)calloc(N_ROWS, sizeof(double));
-  A = (float *)calloc(N_ROWS * N_ROWS, sizeof(float));
+  checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double)));
+  memset(b, 0, N_ROWS * sizeof(double));
+  checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float)));
+  memset(A, 0, N_ROWS * N_ROWS * sizeof(float));

  createLinearSystem(A, b);
  double *x = NULL;
@@ -170,6 +172,9 @@ int main(int argc, char **argv) {
  checkCudaErrors(cudaFree(d_x));
  checkCudaErrors(cudaFree(d_x_new));

+  checkCudaErrors(cudaFreeHost(A));
+  checkCudaErrors(cudaFreeHost(b));
+
  printf("&&&& jacobiCudaGraphs %s\n",
         (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED");
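The swap from calloc() to cudaMallocHost() above gives the sample page-locked (pinned) host buffers, which is what allows asynchronous copies to genuinely overlap with GPU work. A stand-alone sketch of the pattern (array size and the placeholder workload are illustrative, not taken from the sample):

#include <cstring>
#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;
  double *b = nullptr;

  // Page-locked allocation: needed for truly asynchronous H2D/D2H transfers.
  if (cudaMallocHost(&b, n * sizeof(double)) != cudaSuccess) return 1;
  memset(b, 0, n * sizeof(double));  // calloc's zero-fill, now done explicitly

  // ... fill b, cudaMemcpyAsync() to the device, launch kernels, copy back ...

  cudaFreeHost(b);  // pinned memory must be released with cudaFreeHost
  return 0;
}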
@@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla

## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## Build and Run
@@ -38,7 +38,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -104,6 +104,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu

## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## Build and Run
@@ -38,7 +38,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -112,6 +112,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -108,6 +108,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -302,14 +302,10 @@ LIBRARIES :=

################################################################################

-FATBIN_FILE := memMapIpc_kernel${TARGET_SIZE}.fatbin
+PTX_FILE := memMapIpc_kernel${TARGET_SIZE}.ptx

# Gencode arguments
-ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
-SMS ?= 35 37 50 52 60 61 70 72 75 80 86
-else
-SMS ?= 35 37 50 52 60 61 70 75 80 86
-endif
+SMS ?=

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
@@ -395,7 +391,7 @@ endif
# Target rules
all: build

-build: memMapIPCDrv $(FATBIN_FILE)
+build: memMapIPCDrv $(PTX_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@@ -404,8 +400,8 @@ else
    @echo "Sample is ready - all dependencies have been met"
endif

-$(FATBIN_FILE): memMapIpc_kernel.cu
-    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $<
+$(PTX_FILE): memMapIpc_kernel.cu
+    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -ptx $<
    $(EXEC) mkdir -p data
    $(EXEC) cp -f $@ ./data
    $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
@@ -426,9 +422,8 @@ run: build
    $(EXEC) ./memMapIPCDrv

clean:
-    rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(FATBIN_FILE) $(FATBIN_FILE)
+    rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(PTX_FILE) $(PTX_FILE)
    rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/memMapIPCDrv
-    rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(FATBIN_FILE)
+    rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(PTX_FILE)

clobber: clean
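Expanded by hand, the modified rule amounts to something like `nvcc -o memMapIpc_kernel64.ptx -ptx memMapIpc_kernel.cu` (exact include and compiler flags come from the Makefile variables above): the kernel now ships as PTX that the host JIT-compiles at load time via the cuModuleLoadDataEx path shown further down, instead of a prebuilt fatbin, which is why the explicit SMS architecture list could be dropped.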
@@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuLaunchKernel, cuMemcpyD

## Prerequisites

-Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## Build and Run
@@ -38,7 +38,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -67,7 +67,7 @@
      <OutputFile>$(OutDir)/memMapIPCDrv.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
+      <CodeGeneration>compute_35,compute_35;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@@ -105,14 +105,14 @@
  <ItemGroup>
    <ClCompile Include="memMapIpc.cpp" />
    <CudaCompile Include="memMapIpc_kernel.cu">
-      <CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.fatbin</CompileOut>
-      <NvccCompilation>fatbin</NvccCompilation>
+      <CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.ptx</CompileOut>
+      <NvccCompilation>ptx</NvccCompilation>
    </CudaCompile>
    <ClCompile Include="../../Common/helper_multiprocess.cpp" />
    <ClInclude Include="../../Common/helper_multiprocess.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@@ -63,7 +63,7 @@
      <OutputFile>$(OutDir)/memMapIPCDrv.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
+      <CodeGeneration>compute_35,compute_35;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@@ -101,14 +101,14 @@
  <ItemGroup>
    <ClCompile Include="memMapIpc.cpp" />
    <CudaCompile Include="memMapIpc_kernel.cu">
-      <CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.fatbin</CompileOut>
-      <NvccCompilation>fatbin</NvccCompilation>
+      <CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.ptx</CompileOut>
+      <NvccCompilation>ptx</NvccCompilation>
    </CudaCompile>
    <ClCompile Include="../../Common/helper_multiprocess.cpp" />
    <ClInclude Include="../../Common/helper_multiprocess.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
  </ImportGroup>
</Project>
@@ -64,9 +64,13 @@ typedef struct shmStruct_st {
  int sense;
} shmStruct;

-// define input fatbin file
-#ifndef FATBIN_FILE
-#define FATBIN_FILE "memMapIpc_kernel64.fatbin"
+bool findModulePath(const char *, string &, char **, string &);
+
+// define input ptx file for different platforms
+#if defined(_WIN64) || defined(__LP64__)
+#define PTX_FILE "memMapIpc_kernel64.ptx"
+#else
+#define PTX_FILE "memMapIpc_kernel32.ptx"
#endif

// `ipcHandleTypeFlag` specifies the platform specific handle type this sample
@@ -255,23 +259,44 @@ static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size) {

static void memMapGetDeviceFunction(char **argv) {
  // first search for the module path before we load the results
-  string module_path;
-  std::ostringstream fatbin;
-
-  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
+  string module_path, ptx_source;
+  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
+    if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv,
+                        ptx_source)) {
+      printf(
+          "> findModulePath could not find <simpleMemMapIpc> ptx or cubin\n");
      exit(EXIT_FAILURE);
+    }
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

-  if (!fatbin.str().size()) {
-    printf("fatbin file empty. exiting..\n");
-    exit(EXIT_FAILURE);
+  // Create module from binary file (PTX or CUBIN)
+  if (module_path.rfind("ptx") != string::npos) {
+    // in this branch we use compilation with parameters
+    const unsigned int jitNumOptions = 3;
+    CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
+    void **jitOptVals = new void *[jitNumOptions];
+    // set up size of compilation log buffer
+    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+    int jitLogBufferSize = 1024;
+    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
+    // set up pointer to the compilation log buffer
+    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
+    char *jitLogBuffer = new char[jitLogBufferSize];
+    jitOptVals[1] = jitLogBuffer;
+    // set up pointer to set the Maximum # of registers for a particular kernel
+    jitOptions[2] = CU_JIT_MAX_REGISTERS;
+    int jitRegCount = 32;
+    jitOptVals[2] = (void *)(size_t)jitRegCount;
+    checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(),
+                                       jitNumOptions, jitOptions,
+                                       (void **)jitOptVals));
+    printf("> PTX JIT log:\n%s\n", jitLogBuffer);
+  } else {
+    checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
  }

-  // Create module from binary file (FATBIN)
-  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
-
  // Get function handle from module
  checkCudaErrors(
      cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel"));
@@ -585,3 +610,37 @@ int main(int argc, char **argv) {
  return EXIT_SUCCESS;
#endif
}
+
+bool inline findModulePath(const char *module_file, string &module_path,
+                           char **argv, string &ptx_source) {
+  char *actual_path = sdkFindFilePath(module_file, argv[0]);
+
+  if (actual_path) {
+    module_path = actual_path;
+  } else {
+    printf("> findModulePath file not found: <%s> \n", module_file);
+    return false;
+  }
+
+  if (module_path.empty()) {
+    printf("> findModulePath could not find file: <%s> \n", module_file);
+    return false;
+  } else {
+    printf("> findModulePath found file at <%s>\n", module_path.c_str());
+
+    if (module_path.rfind(".ptx") != string::npos) {
+      FILE *fp = fopen(module_path.c_str(), "rb");
+      fseek(fp, 0, SEEK_END);
+      int file_size = ftell(fp);
+      char *buf = new char[file_size + 1];
+      fseek(fp, 0, SEEK_SET);
+      fread(buf, sizeof(char), file_size, fp);
+      fclose(fp);
+      buf[file_size] = '\0';
+      ptx_source = buf;
+      delete[] buf;
+    }
+
+    return true;
+  }
+}
Some files were not shown because too many files have changed in this diff.