add and update samples with CUDA 11.3 support

This commit is contained in:
Rutwik Choughule 2021-04-16 11:54:26 +05:30
parent 067cb65523
commit 568b39bd5b
214 changed files with 6590 additions and 3856 deletions

View File

@ -1,11 +1,17 @@
# CUDA Samples # CUDA Samples
Samples for CUDA developers that demonstrate features in the CUDA Toolkit. This version supports [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads). Samples for CUDA developers that demonstrate features in the CUDA Toolkit. This version supports [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads).
## Release Notes ## Release Notes
This section describes the release notes for the CUDA Samples on GitHub only. This section describes the release notes for the CUDA Samples on GitHub only.
### CUDA 11.3
* Added `streamOrderedAllocationIPC`. Demonstrates Inter Process Communication using one process per GPU for computation.
* Added `simpleCUBLAS_LU`. Demonstrates batched matrix LU decomposition using cuBLAS API `cublas<t>getrfBatched()`.
* Updated `simpleVulkan`. Demonstrates use of timeline semaphore.
* Updated multiple samples to use pinned memory using `cudaMallocHost()`.
### CUDA 11.2 ### CUDA 11.2
* Added `streamOrderedAllocation`. Demonstrates stream ordered memory allocation on a GPU using cudaMallocAsync and cudaMemPool family of APIs. * Added `streamOrderedAllocation`. Demonstrates stream ordered memory allocation on a GPU using cudaMallocAsync and cudaMemPool family of APIs.
* Added `streamOrderedAllocationP2P`. Demonstrates peer-to-peer access of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs. * Added `streamOrderedAllocationP2P`. Demonstrates peer-to-peer access of stream ordered memory allocated using cudaMallocAsync and cudaMemPool family of APIs.
@ -103,7 +109,7 @@ This is the first release of CUDA Samples on GitHub:
### Prerequisites ### Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
For system requirements and installation instructions for the CUDA Toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). For system requirements and installation instructions for the CUDA Toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).
### Getting the CUDA Samples ### Getting the CUDA Samples
@ -160,38 +166,39 @@ The samples makefiles can take advantage of certain options:
### Samples by OS ### Samples by OS
#### Linux #### Linux
**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** |
---|---|---|---| ---|---|---|---|
**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaNvSci](./Samples/cudaNvSci)** |
**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[nvJPEG](./Samples/nvJPEG)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** |
**[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** |
**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[matrixMul](./Samples/matrixMul)** |
**[cudaNvSci](./Samples/cudaNvSci)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** |
**[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[reduction](./Samples/reduction)** |
**[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** |
**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** |
**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** |
**[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** |
**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** |
**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** |
#### Windows #### Windows
**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** |
---|---|---|---| ---|---|---|---|
**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[cudaOpenMP](./Samples/cudaOpenMP)** |
**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[deviceQuery](./Samples/deviceQuery)** |
**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[deviceQuery](./Samples/deviceQuery)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** |
**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[matrixMul](./Samples/matrixMul)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** |
**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[nvJPEG](./Samples/nvJPEG)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** |
**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[reduction](./Samples/reduction)** | **[shfl_scan](./Samples/shfl_scan)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** |
**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
**[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** |
**[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[simpleGL](./Samples/simpleGL)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleVulkan](./Samples/simpleVulkan)** |
**[simpleGL](./Samples/simpleGL)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** |
**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
**[cudaOpenMP](./Samples/cudaOpenMP)** | **[matrixMul](./Samples/matrixMul)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** |
## Dependencies ## Dependencies

View File

@ -285,6 +285,12 @@ ifeq ($(TARGET_OS),android)
SAMPLE_ENABLED := 0 SAMPLE_ENABLED := 0
endif endif
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - EGLStream_CUDA_Interop is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS := ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))

View File

@ -30,7 +30,7 @@ cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed. Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -263,6 +263,14 @@ ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - MersenneTwisterGP11213 is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS := ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@ -297,6 +305,10 @@ ALL_CCFLAGS += --threads 0
LIBRARIES += -lcurand_static -lculibos LIBRARIES += -lcurand_static -lculibos
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################ ################################################################################
# Target rules # Target rules
@ -304,16 +316,23 @@ all: build
build: MersenneTwisterGP11213 build: MersenneTwisterGP11213
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
MersenneTwister.o:MersenneTwister.cpp MersenneTwister.o:MersenneTwister.cpp
$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
MersenneTwisterGP11213: MersenneTwister.o MersenneTwisterGP11213: MersenneTwister.o
$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build run: build
./MersenneTwisterGP11213 $(EXEC) ./MersenneTwisterGP11213
clean: clean:
rm -f MersenneTwisterGP11213 MersenneTwister.o rm -f MersenneTwisterGP11213 MersenneTwister.o

View File

@ -47,138 +47,134 @@
float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU); float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU);
const int DEFAULT_RAND_N = 2400000; const int DEFAULT_RAND_N = 2400000;
const unsigned int DEFAULT_SEED = 777; const unsigned int DEFAULT_SEED = 777;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Main program // Main program
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) int main(int argc, char **argv) {
{ // Start logs
// Start logs printf("%s Starting...\n\n", argv[0]);
printf("%s Starting...\n\n", argv[0]);
// initialize the GPU, either identified by --device // initialize the GPU, either identified by --device
// or by picking the device with highest flop rate. // or by picking the device with highest flop rate.
int devID = findCudaDevice(argc, (const char **)argv); int devID = findCudaDevice(argc, (const char **)argv);
// parsing the number of random numbers to generate // parsing the number of random numbers to generate
int rand_n = DEFAULT_RAND_N; int rand_n = DEFAULT_RAND_N;
if (checkCmdLineFlag(argc, (const char **) argv, "count")) if (checkCmdLineFlag(argc, (const char **)argv, "count")) {
{ rand_n = getCmdLineArgumentInt(argc, (const char **)argv, "count");
rand_n = getCmdLineArgumentInt(argc, (const char **) argv, "count"); }
}
printf("Allocating data for %i samples...\n", rand_n); printf("Allocating data for %i samples...\n", rand_n);
// parsing the seed // parsing the seed
int seed = DEFAULT_SEED; int seed = DEFAULT_SEED;
if (checkCmdLineFlag(argc, (const char **) argv, "seed")) if (checkCmdLineFlag(argc, (const char **)argv, "seed")) {
{ seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed");
seed = getCmdLineArgumentInt(argc, (const char **) argv, "seed"); }
}
printf("Seeding with %i ...\n", seed); printf("Seeding with %i ...\n", seed);
cudaStream_t stream; cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
float *d_Rand; float *d_Rand;
checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float))); checkCudaErrors(cudaMalloc((void **)&d_Rand, rand_n * sizeof(float)));
curandGenerator_t prngGPU; curandGenerator_t prngGPU;
checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32)); checkCudaErrors(curandCreateGenerator(&prngGPU, CURAND_RNG_PSEUDO_MTGP32));
checkCudaErrors(curandSetStream(prngGPU, stream)); checkCudaErrors(curandSetStream(prngGPU, stream));
checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed)); checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngGPU, seed));
curandGenerator_t prngCPU; curandGenerator_t prngCPU;
checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32)); checkCudaErrors(
checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed)); curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, seed));
// //
// Example 1: Compare random numbers generated on GPU and CPU // Example 1: Compare random numbers generated on GPU and CPU
float *h_RandGPU = (float *)malloc(rand_n * sizeof(float)); float *h_RandGPU;
checkCudaErrors(cudaMallocHost(&h_RandGPU, rand_n * sizeof(float)));
printf("Generating random numbers on GPU...\n\n"); printf("Generating random numbers on GPU...\n\n");
checkCudaErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n)); checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n));
printf("\nReading back the results...\n"); printf("\nReading back the results...\n");
checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float), cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaMemcpyAsync(h_RandGPU, d_Rand, rand_n * sizeof(float),
cudaMemcpyDeviceToHost, stream));
float *h_RandCPU = (float *)malloc(rand_n * sizeof(float));
float *h_RandCPU = (float *)malloc(rand_n * sizeof(float)); printf("Generating random numbers on CPU...\n\n");
checkCudaErrors(curandGenerateUniform(prngCPU, (float *)h_RandCPU, rand_n));
printf("Generating random numbers on CPU...\n\n"); checkCudaErrors(cudaStreamSynchronize(stream));
checkCudaErrors(curandGenerateUniform(prngCPU, (float *) h_RandCPU, rand_n)); printf("Comparing CPU/GPU random numbers...\n\n");
float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU);
checkCudaErrors(cudaStreamSynchronize(stream)); //
printf("Comparing CPU/GPU random numbers...\n\n"); // Example 2: Timing of random number generation on GPU
float L1norm = compareResults(rand_n, h_RandGPU, h_RandCPU); const int numIterations = 10;
int i;
StopWatchInterface *hTimer;
// sdkCreateTimer(&hTimer);
// Example 2: Timing of random number generation on GPU sdkResetTimer(&hTimer);
const int numIterations = 10; sdkStartTimer(&hTimer);
int i;
StopWatchInterface *hTimer;
sdkCreateTimer(&hTimer); for (i = 0; i < numIterations; i++) {
sdkResetTimer(&hTimer); checkCudaErrors(curandGenerateUniform(prngGPU, (float *)d_Rand, rand_n));
sdkStartTimer(&hTimer); }
for (i = 0; i < numIterations; i++) checkCudaErrors(cudaStreamSynchronize(stream));
{ sdkStopTimer(&hTimer);
checkCudaErrors(curandGenerateUniform(prngGPU, (float *) d_Rand, rand_n));
}
checkCudaErrors(cudaStreamSynchronize(stream)); double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer) / (double)numIterations;
sdkStopTimer(&hTimer);
double gpuTime = 1.0e-3 * sdkGetTimerValue(&hTimer)/(double)numIterations; printf(
"MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, "
"Size = %u Numbers\n",
1.0e-9 * rand_n / gpuTime, gpuTime, rand_n);
printf("MersenneTwisterGP11213, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers\n", printf("Shutting down...\n");
1.0e-9 * rand_n / gpuTime, gpuTime, rand_n);
printf("Shutting down...\n"); checkCudaErrors(curandDestroyGenerator(prngGPU));
checkCudaErrors(curandDestroyGenerator(prngCPU));
checkCudaErrors(cudaStreamDestroy(stream));
checkCudaErrors(cudaFree(d_Rand));
sdkDeleteTimer(&hTimer);
checkCudaErrors(cudaFreeHost(h_RandGPU));
free(h_RandCPU);
checkCudaErrors(curandDestroyGenerator(prngGPU)); exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
checkCudaErrors(curandDestroyGenerator(prngCPU));
checkCudaErrors(cudaStreamDestroy(stream));
checkCudaErrors(cudaFree(d_Rand));
sdkDeleteTimer(&hTimer);
free(h_RandGPU);
free(h_RandCPU);
exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
} }
float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) {
int i;
float rCPU, rGPU, delta;
float max_delta = 0.;
float sum_delta = 0.;
float sum_ref = 0.;
float compareResults(int rand_n, float *h_RandGPU, float *h_RandCPU) for (i = 0; i < rand_n; i++) {
{ rCPU = h_RandCPU[i];
int i; rGPU = h_RandGPU[i];
float rCPU, rGPU, delta; delta = fabs(rCPU - rGPU);
float max_delta = 0.; sum_delta += delta;
float sum_delta = 0.; sum_ref += fabs(rCPU);
float sum_ref = 0.;
for (i = 0; i < rand_n; i++) if (delta >= max_delta) {
{ max_delta = delta;
rCPU = h_RandCPU[i];
rGPU = h_RandGPU[i];
delta = fabs(rCPU - rGPU);
sum_delta += delta;
sum_ref += fabs(rCPU);
if (delta >= max_delta)
{
max_delta = delta;
}
} }
}
float L1norm = (float)(sum_delta / sum_ref); float L1norm = (float)(sum_delta / sum_ref);
printf("Max absolute error: %E\n", max_delta); printf("Max absolute error: %E\n", max_delta);
printf("L1 norm: %E\n\n", L1norm); printf("L1 norm: %E\n\n", L1norm);
return L1norm; return L1norm;
} }

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed. Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -113,6 +113,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -109,6 +109,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cudaMemcpy2D, cudaMallocManaged
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -28,7 +28,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed. Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -111,6 +111,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -107,6 +107,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -271,6 +271,12 @@ ifeq ($(TARGET_OS),darwin)
SAMPLE_ENABLED := 0 SAMPLE_ENABLED := 0
endif endif
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS := ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))

View File

@ -28,7 +28,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed. Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -36,11 +36,13 @@
#include <string.h> #include <string.h>
#include <fstream> #include <fstream>
#include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_string.h>
#include <npp.h> #include <npp.h>
// Note: If you want to view these images we HIGHLY recommend using imagej // Note: If you want to view these images we HIGHLY recommend using imagej
// which is free on the internet and works on most platforms // which is free on the internet and works on most platforms
// because it is one of the few image viewing apps that can display 32 // because it is one of the few image viewing apps that can display 32
// bit integer image data. While it normalizes the data to floating // bit integer image data. While it normalizes the data to floating
// point values for viewing it still provides a good representation of // point values for viewing it still provides a good representation of
@ -102,11 +104,12 @@ void tearDown() // Clean up and tear down
if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev); if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev);
if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev); if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev);
if (pUFBatchPerImageCompressedCountListHost != 0) if (pUFBatchPerImageCompressedCountListHost != 0)
free(pUFBatchPerImageCompressedCountListHost); cudaFreeHost(pUFBatchPerImageCompressedCountListHost);
if (pUFBatchSrcDstScratchBufferListHost != 0) if (pUFBatchSrcDstScratchBufferListHost != 0)
free(pUFBatchSrcDstScratchBufferListHost); cudaFreeHost(pUFBatchSrcDstScratchBufferListHost);
if (pUFBatchSrcDstImageListHost != 0) free(pUFBatchSrcDstImageListHost); if (pUFBatchSrcDstImageListHost != 0)
if (pUFBatchSrcImageListHost != 0) free(pUFBatchSrcImageListHost); cudaFreeHost(pUFBatchSrcDstImageListHost);
if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost);
for (int j = 0; j < NUMBER_OF_IMAGES; j++) { for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
if (pUFCompressedLabelsScratchBufferDev[j] != 0) if (pUFCompressedLabelsScratchBufferDev[j] != 0)
@ -115,8 +118,8 @@ void tearDown() // Clean up and tear down
cudaFree(pUFGenerateLabelsScratchBufferDev[j]); cudaFree(pUFGenerateLabelsScratchBufferDev[j]);
if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]); if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]);
if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]); if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]);
if (pUFLabelHost[j] != 0) free(pUFLabelHost[j]); if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]);
if (pInputImageHost[j] != 0) free(pInputImageHost[j]); if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]);
} }
} }
@ -177,7 +180,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
bmpFile = fopen(InputFile, "rb"); FOPEN(bmpFile, InputFile, "rb");
} else if (nImage == 1) { } else if (nImage == 1) {
if (nWidth != 512 || nHeight != 512) return -1; if (nWidth != 512 || nHeight != 512) return -1;
const char *fileName = "CT_skull_512x512_8u.raw"; const char *fileName = "CT_skull_512x512_8u.raw";
@ -187,7 +190,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
bmpFile = fopen(InputFile, "rb"); FOPEN(bmpFile, InputFile, "rb");
} else if (nImage == 2) { } else if (nImage == 2) {
if (nWidth != 509 || nHeight != 335) return -1; if (nWidth != 509 || nHeight != 335) return -1;
const char *fileName = "PCB_METAL_509x335_8u.raw"; const char *fileName = "PCB_METAL_509x335_8u.raw";
@ -197,7 +200,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
bmpFile = fopen(InputFile, "rb"); FOPEN(bmpFile, InputFile, "rb");
} else if (nImage == 3) { } else if (nImage == 3) {
if (nWidth != 1024 || nHeight != 683) return -1; if (nWidth != 1024 || nHeight != 683) return -1;
const char *fileName = "PCB2_1024x683_8u.raw"; const char *fileName = "PCB2_1024x683_8u.raw";
@ -207,7 +210,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
bmpFile = fopen(InputFile, "rb"); FOPEN(bmpFile, InputFile, "rb");
} else if (nImage == 4) { } else if (nImage == 4) {
if (nWidth != 1280 || nHeight != 720) return -1; if (nWidth != 1280 || nHeight != 720) return -1;
const char *fileName = "PCB_1280x720_8u.raw"; const char *fileName = "PCB_1280x720_8u.raw";
@ -217,7 +220,7 @@ int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
bmpFile = fopen(InputFile, "rb"); FOPEN(bmpFile, InputFile, "rb");
} else { } else {
printf("Input file load failed.\n"); printf("Input file load failed.\n");
return -1; return -1;
@ -347,9 +350,11 @@ int main(int argc, char **argv) {
oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height); oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height);
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
pInputImageHost[nImage] = reinterpret_cast<Npp8u *>(malloc( checkCudaErrors(cudaMallocHost(
&(pInputImageHost[nImage]),
oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height)); oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height));
pUFLabelHost[nImage] = reinterpret_cast<Npp32u *>(malloc( checkCudaErrors(cudaMallocHost(
&(pUFLabelHost[nImage]),
oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height)); oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height));
// Use UF functions throughout this sample. // Use UF functions throughout this sample.
@ -409,15 +414,15 @@ int main(int argc, char **argv) {
} }
if (nImage == 0) if (nImage == 0)
bmpFile = fopen(LabelMarkersOutputFile0.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb");
else if (nImage == 1) else if (nImage == 1)
bmpFile = fopen(LabelMarkersOutputFile1.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb");
else if (nImage == 2) else if (nImage == 2)
bmpFile = fopen(LabelMarkersOutputFile2.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb");
else if (nImage == 3) else if (nImage == 3)
bmpFile = fopen(LabelMarkersOutputFile3.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb");
else if (nImage == 4) else if (nImage == 4)
bmpFile = fopen(LabelMarkersOutputFile4.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb");
if (bmpFile == NULL) return -1; if (bmpFile == NULL) return -1;
size_t nSize = 0; size_t nSize = 0;
@ -478,15 +483,15 @@ int main(int argc, char **argv) {
} }
if (nImage == 0) if (nImage == 0)
bmpFile = fopen(CompressedMarkerLabelsOutputFile0.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb");
else if (nImage == 1) else if (nImage == 1)
bmpFile = fopen(CompressedMarkerLabelsOutputFile1.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb");
else if (nImage == 2) else if (nImage == 2)
bmpFile = fopen(CompressedMarkerLabelsOutputFile2.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb");
else if (nImage == 3) else if (nImage == 3)
bmpFile = fopen(CompressedMarkerLabelsOutputFile3.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb");
else if (nImage == 4) else if (nImage == 4)
bmpFile = fopen(CompressedMarkerLabelsOutputFile4.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb");
if (bmpFile == NULL) return -1; if (bmpFile == NULL) return -1;
nSize = 0; nSize = 0;
@ -554,10 +559,11 @@ int main(int argc, char **argv) {
cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes); cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes);
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR; if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
pUFBatchSrcImageListHost = checkCudaErrors(
reinterpret_cast<NppiImageDescriptor *>(malloc(nBatchImageListBytes)); cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes));
pUFBatchSrcDstImageListHost =
reinterpret_cast<NppiImageDescriptor *>(malloc(nBatchImageListBytes)); checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost,
nBatchImageListBytes));
NppiSize oMaxROISize = {0, 0}; NppiSize oMaxROISize = {0, 0};
@ -620,15 +626,15 @@ int main(int argc, char **argv) {
// Save output to files // Save output to files
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
if (nImage == 0) if (nImage == 0)
bmpFile = fopen(LabelMarkersBatchOutputFile0.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb");
else if (nImage == 1) else if (nImage == 1)
bmpFile = fopen(LabelMarkersBatchOutputFile1.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb");
else if (nImage == 2) else if (nImage == 2)
bmpFile = fopen(LabelMarkersBatchOutputFile2.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb");
else if (nImage == 3) else if (nImage == 3)
bmpFile = fopen(LabelMarkersBatchOutputFile3.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb");
else if (nImage == 4) else if (nImage == 4)
bmpFile = fopen(LabelMarkersBatchOutputFile4.c_str(), "wb"); FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb");
if (bmpFile == NULL) return -1; if (bmpFile == NULL) return -1;
size_t nSize = 0; size_t nSize = 0;
@ -652,12 +658,13 @@ int main(int argc, char **argv) {
// Allocate host side scratch buffer pointer and size list and initialize with // Allocate host side scratch buffer pointer and size list and initialize with
// device scratch buffer pointers // device scratch buffer pointers
pUFBatchSrcDstScratchBufferListHost = checkCudaErrors(
reinterpret_cast<NppiBufferDescriptor *>( cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost,
malloc(NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor))); NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor)));
pUFBatchPerImageCompressedCountListHost = checkCudaErrors(
reinterpret_cast<Npp32u *>(malloc(NUMBER_OF_IMAGES * sizeof(Npp32u))); cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost,
+NUMBER_OF_IMAGES * sizeof(Npp32u)));
// Start buffer pointer at beginning of full per image buffer list sized // Start buffer pointer at beginning of full per image buffer list sized
// pUFCompressedLabelsScratchBufferDev[0] // pUFCompressedLabelsScratchBufferDev[0]
@ -728,15 +735,15 @@ int main(int argc, char **argv) {
// Save compressed label images into files // Save compressed label images into files
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) { for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
if (nImage == 0) if (nImage == 0)
bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb");
else if (nImage == 1) else if (nImage == 1)
bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb");
else if (nImage == 2) else if (nImage == 2)
bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb");
else if (nImage == 3) else if (nImage == 3)
bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb");
else if (nImage == 4) else if (nImage == 4)
bmpFile = fopen(CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb"); FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb");
if (bmpFile == NULL) return -1; if (bmpFile == NULL) return -1;
size_t nSize = 0; size_t nSize = 0;

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -31,14 +31,16 @@
* 1.) Each thread loads a value from random array. * 1.) Each thread loads a value from random array.
* 2.) then checks if it is odd or even. * 2.) then checks if it is odd or even.
* 3.) create binary partition group based on the above predicate * 3.) create binary partition group based on the above predicate
* 4.) we count the number of odd/even in the group based on size of the binary groups * 4.) we count the number of odd/even in the group based on size of the binary
groups
* 5.) write it to the global counter of odds. * 5.) write it to the global counter of odds.
* 6.) sum the values loaded by individual threads(using reduce) and write it to global * 6.) sum the values loaded by individual threads(using reduce) and write it to
* even & odd elements sum. global even & odd elements sum.
* *
* **NOTE** : binary_partition results in splitting warp into divergent thread groups * **NOTE** :
this is not good from performance perspective, but in cases where warp * binary_partition results in splitting warp into divergent thread groups
divergence is inevitable one can use binary_partition group. * this is not good from performance perspective, but in cases where warp
* divergence is inevitable one can use binary_partition group.
*/ */
#include <stdio.h> #include <stdio.h>
@ -48,108 +50,110 @@
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
void initOddEvenArr(int *inputArr, unsigned int size) void initOddEvenArr(int *inputArr, unsigned int size) {
{ for (int i = 0; i < size; i++) {
for (int i=0; i < size; i++) inputArr[i] = rand() % 50;
{ }
inputArr[i] = rand() % 50;
}
} }
/** /**
* CUDA kernel device code * CUDA kernel device code
* *
* Creates cooperative groups and performs odd/even counting & summation. * Creates cooperative groups and performs odd/even counting & summation.
*/ */
__global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds, int *sumOfOddAndEvens, unsigned int size) __global__ void oddEvenCountAndSumCG(int *inputArr, int *numOfOdds,
{ int *sumOfOddAndEvens, unsigned int size) {
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid(); cg::grid_group grid = cg::this_grid();
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
for (int i = grid.thread_rank(); i < size; i += grid.size()) for (int i = grid.thread_rank(); i < size; i += grid.size()) {
int elem = inputArr[i];
auto subTile = cg::binary_partition(tile32, elem & 1);
if (elem & 1) // Odd numbers group
{ {
int elem = inputArr[i]; int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
auto subTile = cg::binary_partition(tile32, elem & 1);
if (elem & 1) // Odd numbers group
{
int oddGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
if (subTile.thread_rank() == 0) if (subTile.thread_rank() == 0) {
{ // Add number of odds present in this group of Odds.
// Add number of odds present in this group of Odds. atomicAdd(numOfOdds, subTile.size());
atomicAdd(numOfOdds, subTile.size());
// Add local reduction of odds present in this group of Odds. // Add local reduction of odds present in this group of Odds.
atomicAdd(&sumOfOddAndEvens[0], oddGroupSum); atomicAdd(&sumOfOddAndEvens[0], oddGroupSum);
}
} else // Even numbers group
{
int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
} if (subTile.thread_rank() == 0) {
} // Add local reduction of even present in this group of evens.
else // Even numbers group atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
{ }
int evenGroupSum = cg::reduce(subTile, elem, cg::plus<int>());
if (subTile.thread_rank() == 0)
{
// Add local reduction of even present in this group of evens.
atomicAdd(&sumOfOddAndEvens[1], evenGroupSum);
}
}
// reconverge warp so for next loop iteration we ensure convergence of
// above diverged threads to perform coalesced loads of inputArr.
cg::sync(tile32);
} }
// reconverge warp so for next loop iteration we ensure convergence of
// above diverged threads to perform coalesced loads of inputArr.
cg::sync(tile32);
}
} }
/** /**
* Host main routine * Host main routine
*/ */
int main(int argc, const char **argv) int main(int argc, const char **argv) {
{ int deviceId = findCudaDevice(argc, argv);
int deviceId = findCudaDevice(argc, argv); int *h_inputArr, *d_inputArr;
int *h_inputArr, *d_inputArr; int *h_numOfOdds, *d_numOfOdds;
int *h_numOfOdds, *d_numOfOdds; int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems;
int *h_sumOfOddEvenElems, *d_sumOfOddEvenElems; unsigned int arrSize = 1024 * 100;
unsigned int arrSize = 1024 * 100;
h_inputArr = new int[arrSize]; checkCudaErrors(cudaMallocHost(&h_inputArr, sizeof(int) * arrSize));
h_numOfOdds = new int[1]; checkCudaErrors(cudaMallocHost(&h_numOfOdds, sizeof(int)));
h_sumOfOddEvenElems = new int[2]; checkCudaErrors(cudaMallocHost(&h_sumOfOddEvenElems, sizeof(int) * 2));
initOddEvenArr(h_inputArr, arrSize); initOddEvenArr(h_inputArr, arrSize);
cudaStream_t stream; cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int)*arrSize)); checkCudaErrors(cudaMalloc(&d_inputArr, sizeof(int) * arrSize));
checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int))); checkCudaErrors(cudaMalloc(&d_numOfOdds, sizeof(int)));
checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int)*2)); checkCudaErrors(cudaMalloc(&d_sumOfOddEvenElems, sizeof(int) * 2));
checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int)*arrSize, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_inputArr, h_inputArr, sizeof(int) * arrSize,
checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream)); cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2*sizeof(int), stream)); checkCudaErrors(cudaMemsetAsync(d_numOfOdds, 0, sizeof(int), stream));
checkCudaErrors(
cudaMemsetAsync(d_sumOfOddEvenElems, 0, 2 * sizeof(int), stream));
//Launch the kernel // Launch the kernel
int threadsPerBlock=1024; int threadsPerBlock = 0;
int blocksPerGrid = arrSize / threadsPerBlock; int blocksPerGrid = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
&blocksPerGrid, &threadsPerBlock, oddEvenCountAndSumCG, 0, 0));
printf("\nLaunching %d blocks with %d threads...\n\n",blocksPerGrid, threadsPerBlock); printf("\nLaunching %d blocks with %d threads...\n\n", blocksPerGrid,
threadsPerBlock);
oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize); oddEvenCountAndSumCG<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
d_inputArr, d_numOfOdds, d_sumOfOddEvenElems, arrSize);
checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int), cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaMemcpyAsync(h_numOfOdds, d_numOfOdds, sizeof(int),
checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems, 2*sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaMemcpyAsync(h_sumOfOddEvenElems, d_sumOfOddEvenElems,
2 * sizeof(int), cudaMemcpyDeviceToHost,
stream));
checkCudaErrors(cudaStreamSynchronize(stream));
printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n", arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0], h_sumOfOddEvenElems[1]); printf("Array size = %d Num of Odds = %d Sum of Odds = %d Sum of Evens %d\n",
printf("\n...Done.\n\n"); arrSize, h_numOfOdds[0], h_sumOfOddEvenElems[0],
h_sumOfOddEvenElems[1]);
printf("\n...Done.\n\n");
delete[] h_inputArr; checkCudaErrors(cudaFreeHost(h_inputArr));
delete[] h_numOfOdds; checkCudaErrors(cudaFreeHost(h_numOfOdds));
delete[] h_sumOfOddEvenElems; checkCudaErrors(cudaFreeHost(h_sumOfOddEvenElems));
checkCudaErrors(cudaFree(d_inputArr)); checkCudaErrors(cudaFree(d_inputArr));
checkCudaErrors(cudaFree(d_numOfOdds)); checkCudaErrors(cudaFree(d_numOfOdds));
checkCudaErrors(cudaFree(d_sumOfOddEvenElems)); checkCudaErrors(cudaFree(d_sumOfOddEvenElems));
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -118,6 +118,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -114,6 +114,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -118,6 +118,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -114,6 +114,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1 SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - conjugateGradientCudaGraphs is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS := ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))

View File

@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -25,7 +25,6 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
/* /*
* This sample implements a conjugate gradient solver on GPU * This sample implements a conjugate gradient solver on GPU
* using CUBLAS and CUSPARSE with CUDA Graphs * using CUBLAS and CUSPARSE with CUDA Graphs
@ -46,7 +45,6 @@
#include <helper_cuda.h> // helper function CUDA error checking and initialization #include <helper_cuda.h> // helper function CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples #include <helper_functions.h> // helper for shared functions common to CUDA Samples
const char *sSDKname = "conjugateGradientCudaGraphs"; const char *sSDKname = "conjugateGradientCudaGraphs";
#ifndef WITH_GRAPH #ifndef WITH_GRAPH
@ -145,12 +143,12 @@ int main(int argc, char **argv) {
/* Generate a random tridiagonal symmetric matrix in CSR format */ /* Generate a random tridiagonal symmetric matrix in CSR format */
N = 1048576; N = 1048576;
nz = (N - 2) * 3 + 4; nz = (N - 2) * 3 + 4;
I = (int *)malloc(sizeof(int) * (N + 1)); checkCudaErrors(cudaMallocHost(&I, sizeof(int) * (N + 1)));
J = (int *)malloc(sizeof(int) * nz); checkCudaErrors(cudaMallocHost(&J, sizeof(int) * nz));
val = (float *)malloc(sizeof(float) * nz); checkCudaErrors(cudaMallocHost(&val, sizeof(float) * nz));
genTridiag(I, J, val, N, nz); genTridiag(I, J, val, N, nz);
x = (float *)malloc(sizeof(float) * N); checkCudaErrors(cudaMallocHost(&x, sizeof(float) * N));
rhs = (float *)malloc(sizeof(float) * N); rhs = (float *)malloc(sizeof(float) * N);
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
@ -192,9 +190,9 @@ int main(int argc, char **argv) {
/* Wrap raw data into cuSPARSE generic API objects */ /* Wrap raw data into cuSPARSE generic API objects */
cusparseSpMatDescr_t matA = NULL; cusparseSpMatDescr_t matA = NULL;
checkCudaErrors(cusparseCreateCsr( checkCudaErrors(cusparseCreateCsr(&matA, N, N, nz, d_row, d_col, d_val,
&matA, N, N, nz, d_row, d_col, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
cusparseDnVecDescr_t vecx = NULL; cusparseDnVecDescr_t vecx = NULL;
checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F)); checkCudaErrors(cusparseCreateDnVec(&vecx, N, d_x, CUDA_R_32F));
cusparseDnVecDescr_t vecp = NULL; cusparseDnVecDescr_t vecp = NULL;
@ -206,7 +204,7 @@ int main(int argc, char **argv) {
size_t bufferSize = 0; size_t bufferSize = 0;
checkCudaErrors(cusparseSpMV_bufferSize( checkCudaErrors(cusparseSpMV_bufferSize(
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx,
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
void *buffer = NULL; void *buffer = NULL;
checkCudaErrors(cudaMalloc(&buffer, bufferSize)); checkCudaErrors(cudaMalloc(&buffer, bufferSize));
@ -234,9 +232,9 @@ int main(int argc, char **argv) {
beta = 0.0; beta = 0.0;
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
checkCudaErrors(cusparseSpMV( checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecx, &alpha, matA, vecx, &beta, vecAx, CUDA_R_32F,
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); CUSPARSE_SPMV_ALG_DEFAULT, buffer));
checkCudaErrors(cublasSetStream(cublasHandle, stream1)); checkCudaErrors(cublasSetStream(cublasHandle, stream1));
checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1)); checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1));
@ -248,9 +246,9 @@ int main(int argc, char **argv) {
k = 1; k = 1;
// First Iteration when k=1 starts // First Iteration when k=1 starts
checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1)); checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1));
checkCudaErrors(cusparseSpMV( checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F,
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); CUSPARSE_SPMV_ALG_DEFAULT, buffer));
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
@ -290,9 +288,9 @@ int main(int argc, char **argv) {
checkCudaErrors( checkCudaErrors(
cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
checkCudaErrors(cusparseSpMV( checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, &alpha, matA, vecp, &beta, vecAx, CUDA_R_32F,
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); CUSPARSE_SPMV_ALG_DEFAULT, buffer));
checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1)); checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1));
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
@ -335,8 +333,8 @@ int main(int argc, char **argv) {
checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1)); checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1));
checkCudaErrors(cusparseSpMV( checkCudaErrors(cusparseSpMV(
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp, cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecp,
&beta, vecAx, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &buffer)); &beta, vecAx, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot)); checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
@ -395,23 +393,31 @@ int main(int argc, char **argv) {
cusparseDestroy(cusparseHandle); cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle); cublasDestroy(cublasHandle);
if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); } if (matA) {
if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); } checkCudaErrors(cusparseDestroySpMat(matA));
if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); } }
if (vecp ) { checkCudaErrors(cusparseDestroyDnVec(vecp)); } if (vecx) {
checkCudaErrors(cusparseDestroyDnVec(vecx));
}
if (vecAx) {
checkCudaErrors(cusparseDestroyDnVec(vecAx));
}
if (vecp) {
checkCudaErrors(cusparseDestroyDnVec(vecp));
}
free(I); checkCudaErrors(cudaFreeHost(I));
free(J); checkCudaErrors(cudaFreeHost(J));
free(val); checkCudaErrors(cudaFreeHost(val));
free(x); checkCudaErrors(cudaFreeHost(x));
free(rhs); free(rhs);
cudaFree(d_col); checkCudaErrors(cudaFree(d_col));
cudaFree(d_row); checkCudaErrors(cudaFree(d_row));
cudaFree(d_val); checkCudaErrors(cudaFree(d_val));
cudaFree(d_x); checkCudaErrors(cudaFree(d_x));
cudaFree(d_r); checkCudaErrors(cudaFree(d_r));
cudaFree(d_p); checkCudaErrors(cudaFree(d_p));
cudaFree(d_Ax); checkCudaErrors(cudaFree(d_Ax));
printf("Test Summary: Error amount = %f\n", err); printf("Test Summary: Error amount = %f\n", err);
exit((k <= max_iter) ? 0 : 1); exit((k <= max_iter) ? 0 : 1);

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -223,8 +223,10 @@ __device__ void gpuDotProduct(float *vecA, float *vecB, int size,
cg::sync(cta); cg::sync(cta);
if (tile32.meta_group_rank() == 0) { if (tile32.meta_group_rank() == 0) {
temp_sum = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0; temp_sum = tile32.thread_rank() < tile32.meta_group_size()
temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>()); ? tmp[tile32.thread_rank()]
: 0.0;
temp_sum = cg::reduce(tile32, temp_sum, cg::plus<double>());
if (tile32.thread_rank() == 0) { if (tile32.thread_rank() == 0) {
atomicAdd(&grid_dot_result, temp_sum); atomicAdd(&grid_dot_result, temp_sum);
@ -239,8 +241,9 @@ __device__ void gpuCopyVector(float *srcA, float *destB, int size,
} }
} }
__device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale, int size, __device__ void gpuScaleVectorAndSaxpy(float *x, float *y, float a, float scale,
const cg::multi_grid_group &multi_grid) { int size,
const cg::multi_grid_group &multi_grid) {
for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) { for (int i = multi_grid.thread_rank(); i < size; i += multi_grid.size()) {
y[i] = a * x[i] + scale * y[i]; y[i] = a * x[i] + scale * y[i];
} }
@ -360,10 +363,11 @@ std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
// Filter unsupported devices // Filter unsupported devices
if (deviceProp.cooperativeMultiDeviceLaunch && if (deviceProp.cooperativeMultiDeviceLaunch &&
deviceProp.concurrentManagedAccess) { deviceProp.concurrentManagedAccess) {
identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor), i); identicalGpus.emplace(std::make_pair(deviceProp.major, deviceProp.minor),
i);
} }
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i, printf("GPU Device %d: \"%s\" with compute capability %d.%d\n", i,
deviceProp.name, deviceProp.major, deviceProp.minor); deviceProp.name, deviceProp.major, deviceProp.minor);
} }
return identicalGpus; return identicalGpus;
@ -387,15 +391,17 @@ int main(int argc, char **argv) {
auto bestFit = std::make_pair(it, it); auto bestFit = std::make_pair(it, it);
// use std::distance to find the largest number of GPUs amongst architectures // use std::distance to find the largest number of GPUs amongst architectures
auto distance = [](decltype(bestFit) p){return std::distance(p.first, p.second);}; auto distance = [](decltype(bestFit) p) {
return std::distance(p.first, p.second);
};
// Read each unique key/pair element in order // Read each unique key/pair element in order
for (; it != end; it = gpusByArch.upper_bound(it->first)) { for (; it != end; it = gpusByArch.upper_bound(it->first)) {
// first and second are iterators bounded within the architecture group // first and second are iterators bounded within the architecture group
auto testFit = gpusByArch.equal_range(it->first); auto testFit = gpusByArch.equal_range(it->first);
// Always use devices with highest architecture version or whichever has the most devices available // Always use devices with highest architecture version or whichever has the
if (distance(bestFit) <= distance(testFit)) // most devices available
bestFit = testFit; if (distance(bestFit) <= distance(testFit)) bestFit = testFit;
} }
if (distance(bestFit) < kNumGpusRequired) { if (distance(bestFit) < kNumGpusRequired) {
@ -408,33 +414,35 @@ int main(int argc, char **argv) {
std::set<int> bestFitDeviceIds; std::set<int> bestFitDeviceIds;
// check & select peer-to-peer access capable GPU devices as enabling p2p access between participating // check & select peer-to-peer access capable GPU devices as enabling p2p
// access between participating
// GPUs gives better performance for multi_grid sync. // GPUs gives better performance for multi_grid sync.
for (auto itr = bestFit.first; itr != bestFit.second; itr++) { for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
int deviceId = itr->second; int deviceId = itr->second;
checkCudaErrors(cudaSetDevice(deviceId)); checkCudaErrors(cudaSetDevice(deviceId));
std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds](decltype(*itr) mapPair) { std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds,
if (deviceId != mapPair.second) &kNumGpusRequired](
{ decltype(*itr) mapPair) {
if (deviceId != mapPair.second) {
int access = 0; int access = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second)); checkCudaErrors(
printf("Device=%d %s Access Peer Device=%d\n", deviceId, access ? "CAN" : "CANNOT", mapPair.second); cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
printf("Device=%d %s Access Peer Device=%d\n", deviceId,
access ? "CAN" : "CANNOT", mapPair.second);
if (access && bestFitDeviceIds.size() < kNumGpusRequired) { if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
bestFitDeviceIds.emplace(deviceId); bestFitDeviceIds.emplace(deviceId);
bestFitDeviceIds.emplace(mapPair.second); bestFitDeviceIds.emplace(mapPair.second);
} } else {
else {
printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
} }
} }
}); });
if (bestFitDeviceIds.size() >= kNumGpusRequired) if (bestFitDeviceIds.size() >= kNumGpusRequired) {
{
printf("Selected p2p capable devices - "); printf("Selected p2p capable devices - ");
for (auto devicesItr = bestFitDeviceIds.begin(); devicesItr != bestFitDeviceIds.end(); devicesItr++) for (auto devicesItr = bestFitDeviceIds.begin();
{ devicesItr != bestFitDeviceIds.end(); devicesItr++) {
printf("deviceId = %d ", *devicesItr); printf("deviceId = %d ", *devicesItr);
} }
printf("\n"); printf("\n");
@ -442,33 +450,34 @@ int main(int argc, char **argv) {
} }
} }
// if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p capable, // if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p
// capable,
// hence we add it without p2p capability check. // hence we add it without p2p capability check.
if (!bestFitDeviceIds.size()) if (!bestFitDeviceIds.size()) {
{ printf("Devices involved are not p2p capable.. selecting %zu of them\n",
printf("Devices involved are not p2p capable.. selecting %zu of them\n", kNumGpusRequired); kNumGpusRequired);
std::for_each(bestFit.first, bestFit.second, [&bestFitDeviceIds](decltype(*bestFit.first) mapPair) { std::for_each(bestFit.first, bestFit.second,
if (bestFitDeviceIds.size() < kNumGpusRequired) { [&bestFitDeviceIds,
bestFitDeviceIds.emplace(mapPair.second); &kNumGpusRequired](decltype(*bestFit.first) mapPair) {
} if (bestFitDeviceIds.size() < kNumGpusRequired) {
else { bestFitDeviceIds.emplace(mapPair.second);
printf("Ignoring device %i (max devices exceeded)\n", mapPair.second); } else {
} printf("Ignoring device %i (max devices exceeded)\n",
// Insert the sequence into the deviceIds set mapPair.second);
}); }
} // Insert the sequence into the deviceIds set
else });
{ } else {
// perform cudaDeviceEnablePeerAccess in both directions for all participating devices // perform cudaDeviceEnablePeerAccess in both directions for all
// of a cudaLaunchCooperativeKernelMultiDevice call this gives better performance for multi_grid sync. // participating devices of a cudaLaunchCooperativeKernelMultiDevice call
for (auto p1_itr = bestFitDeviceIds.begin(); p1_itr != bestFitDeviceIds.end(); p1_itr++) // this gives better performance for multi_grid sync.
{ for (auto p1_itr = bestFitDeviceIds.begin();
p1_itr != bestFitDeviceIds.end(); p1_itr++) {
checkCudaErrors(cudaSetDevice(*p1_itr)); checkCudaErrors(cudaSetDevice(*p1_itr));
for (auto p2_itr = bestFitDeviceIds.begin(); p2_itr != bestFitDeviceIds.end(); p2_itr++) for (auto p2_itr = bestFitDeviceIds.begin();
{ p2_itr != bestFitDeviceIds.end(); p2_itr++) {
if (*p1_itr != *p2_itr) if (*p1_itr != *p2_itr) {
{ checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0));
checkCudaErrors(cudaDeviceEnablePeerAccess(*p2_itr, 0 ));
checkCudaErrors(cudaSetDevice(*p1_itr)); checkCudaErrors(cudaSetDevice(*p1_itr));
} }
} }
@ -518,7 +527,7 @@ int main(int argc, char **argv) {
std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl; std::cout << "\nRunning on GPUs = " << kNumGpusRequired << std::endl;
cudaStream_t nStreams[kNumGpusRequired]; cudaStream_t nStreams[kNumGpusRequired];
int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK/32) + 1); int sMemSize = sizeof(double) * ((THREADS_PER_BLOCK / 32) + 1);
int numBlocksPerSm = INT_MAX; int numBlocksPerSm = INT_MAX;
int numThreads = THREADS_PER_BLOCK; int numThreads = THREADS_PER_BLOCK;
int numSms = INT_MAX; int numSms = INT_MAX;
@ -530,17 +539,16 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaSetDevice(*deviceId));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, *deviceId));
int numBlocksPerSm_current=0; int numBlocksPerSm_current = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm_current, multiGpuConjugateGradient, numThreads, sMemSize)); &numBlocksPerSm_current, multiGpuConjugateGradient, numThreads,
sMemSize));
if (numBlocksPerSm > numBlocksPerSm_current) if (numBlocksPerSm > numBlocksPerSm_current) {
{ numBlocksPerSm = numBlocksPerSm_current;
numBlocksPerSm = numBlocksPerSm_current;
} }
if (numSms > deviceProp.multiProcessorCount) if (numSms > deviceProp.multiProcessorCount) {
{ numSms = deviceProp.multiProcessorCount;
numSms = deviceProp.multiProcessorCount;
} }
deviceId++; deviceId++;
} }
@ -554,7 +562,7 @@ int main(int argc, char **argv) {
int device_count = 0; int device_count = 0;
int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK; int totalThreadsPerGPU = numSms * numBlocksPerSm * THREADS_PER_BLOCK;
deviceId = bestFitDeviceIds.begin();; deviceId = bestFitDeviceIds.begin();
while (deviceId != bestFitDeviceIds.end()) { while (deviceId != bestFitDeviceIds.end()) {
checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaSetDevice(*deviceId));
checkCudaErrors(cudaStreamCreate(&nStreams[device_count])); checkCudaErrors(cudaStreamCreate(&nStreams[device_count]));
@ -621,14 +629,15 @@ int main(int argc, char **argv) {
printf("Total threads per GPU = %d numBlocksPerSm = %d\n", printf("Total threads per GPU = %d numBlocksPerSm = %d\n",
numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm); numSms * numBlocksPerSm * THREADS_PER_BLOCK, numBlocksPerSm);
dim3 dimGrid(numSms * numBlocksPerSm, 1, 1), dimBlock(THREADS_PER_BLOCK, 1, 1); dim3 dimGrid(numSms * numBlocksPerSm, 1, 1),
dimBlock(THREADS_PER_BLOCK, 1, 1);
void *kernelArgs[] = { void *kernelArgs[] = {
(void *)&I, (void *)&J, (void *)&val, (void *)&x, (void *)&I, (void *)&J, (void *)&val, (void *)&x,
(void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result, (void *)&Ax, (void *)&p, (void *)&r, (void *)&dot_result,
(void *)&nz, (void *)&N, (void *)&tol, (void *)&nz, (void *)&N, (void *)&tol,
}; };
cudaLaunchParams *launchParamsList = (cudaLaunchParams *)malloc( cudaLaunchParams *launchParamsList =
sizeof(cudaLaunchParams) * kNumGpusRequired); (cudaLaunchParams *)malloc(sizeof(cudaLaunchParams) * kNumGpusRequired);
for (int i = 0; i < kNumGpusRequired; i++) { for (int i = 0; i < kNumGpusRequired; i++) {
launchParamsList[i].func = (void *)multiGpuConjugateGradient; launchParamsList[i].func = (void *)multiGpuConjugateGradient;
launchParamsList[i].gridDim = dimGrid; launchParamsList[i].gridDim = dimGrid;
@ -645,12 +654,11 @@ int main(int argc, char **argv) {
cudaCooperativeLaunchMultiDeviceNoPreSync | cudaCooperativeLaunchMultiDeviceNoPreSync |
cudaCooperativeLaunchMultiDeviceNoPostSync)); cudaCooperativeLaunchMultiDeviceNoPostSync));
checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId));
checkCudaErrors( checkCudaErrors(
cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
checkCudaErrors(
cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
deviceId = bestFitDeviceIds.begin();; deviceId = bestFitDeviceIds.begin();
device_count = 0; device_count = 0;
while (deviceId != bestFitDeviceIds.end()) { while (deviceId != bestFitDeviceIds.end()) {
checkCudaErrors(cudaSetDevice(*deviceId)); checkCudaErrors(cudaSetDevice(*deviceId));
@ -658,7 +666,7 @@ int main(int argc, char **argv) {
deviceId++; deviceId++;
} }
r1 = *dot_result; r1 = (float)*dot_result;
printf("GPU Final, residual = %e \n ", sqrt(r1)); printf("GPU Final, residual = %e \n ", sqrt(r1));

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -109,6 +109,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -271,6 +271,12 @@ ifeq ($(TARGET_ARCH),armv7l)
SAMPLE_ENABLED := 0 SAMPLE_ENABLED := 0
endif endif
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - cuSolverDn_LinearSolver is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(TARGET_OS),linux) ifeq ($(TARGET_OS),linux)
ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\"
endif endif

View File

@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -110,6 +110,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -265,6 +265,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1 SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - cuSolverSp_LinearSolver is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(TARGET_OS),linux) ifeq ($(TARGET_OS),linux)
ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\" ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\"
endif endif

View File

@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -495,13 +495,13 @@ int main(int argc, char *argv[]) {
size_t bufferSize = 0; size_t bufferSize = 0;
checkCudaErrors(cusparseSpMV_bufferSize( checkCudaErrors(cusparseSpMV_bufferSize(
cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx, cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
&one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)); &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
void *buffer = NULL; void *buffer = NULL;
checkCudaErrors(cudaMalloc(&buffer, bufferSize)); checkCudaErrors(cudaMalloc(&buffer, bufferSize));
checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F,
CUSPARSE_MV_ALG_DEFAULT, &buffer)); CUSPARSE_SPMV_ALG_DEFAULT, buffer));
checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA, checkCudaErrors(cudaMemcpyAsync(h_r, d_r, sizeof(double) * rowsA,
cudaMemcpyDeviceToHost, stream)); cudaMemcpyDeviceToHost, stream));
@ -559,7 +559,7 @@ int main(int argc, char *argv[]) {
checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, checkCudaErrors(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&minus_one, matA, vecx, &one, vecAx, CUDA_R_64F, &minus_one, matA, vecx, &one, vecAx, CUDA_R_64F,
CUSPARSE_MV_ALG_DEFAULT, &buffer)); CUSPARSE_SPMV_ALG_DEFAULT, buffer));
checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA, checkCudaErrors(cudaMemcpyAsync(h_x, d_x, sizeof(double) * colsA,
cudaMemcpyDeviceToHost, stream)); cudaMemcpyDeviceToHost, stream));

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -110,6 +110,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -30,7 +30,7 @@ cudaMalloc, cudaFree
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -109,6 +109,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -279,6 +279,12 @@ ifeq ($(TARGET_ARCH),armv7l)
SAMPLE_ENABLED := 0 SAMPLE_ENABLED := 0
endif endif
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - cudaNvSci is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS := ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))

View File

@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaExternalMemoryG
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -109,6 +109,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -112,10 +112,10 @@ int main(int argc, char **argv) {
char msg[256]; char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(msg, sizeof(msg), sprintf_s(msg, sizeof(msg),
" Total amount of global memory: %.0f MBytes " " Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n", "(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f), static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
(unsigned long long)deviceProp.totalGlobalMem); (unsigned long long)deviceProp.totalGlobalMem);
#else #else
snprintf(msg, sizeof(msg), snprintf(msg, sizeof(msg),
" Total amount of global memory: %.0f MBytes " " Total amount of global memory: %.0f MBytes "
@ -125,7 +125,7 @@ int main(int argc, char **argv) {
#endif #endif
printf("%s", msg); printf("%s", msg);
printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
deviceProp.multiProcessorCount, deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
@ -250,8 +250,7 @@ int main(int argc, char **argv) {
"device)", "device)",
"Exclusive Process (many threads in one process is able to use " "Exclusive Process (many threads in one process is able to use "
"::cudaSetDevice() with this device)", "::cudaSetDevice() with this device)",
"Unknown", "Unknown", NULL};
NULL};
printf(" Compute Mode:\n"); printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
} }
@ -272,7 +271,7 @@ int main(int argc, char **argv) {
// must be enabled to support this // must be enabled to support this
&& prop[i].tccDriver && prop[i].tccDriver
#endif #endif
) { ) {
// This is an array of P2P capable GPUs // This is an array of P2P capable GPUs
gpuid[gpu_p2p_count++] = i; gpuid[gpu_p2p_count++] = i;
} }
@ -307,7 +306,8 @@ int main(int argc, char **argv) {
// driver version // driver version
sProfileString += ", CUDA Driver Version = "; sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000,
(driverVersion % 100) / 10);
#else #else
snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
(driverVersion % 100) / 10); (driverVersion % 100) / 10);
@ -317,7 +317,8 @@ int main(int argc, char **argv) {
// Runtime version // Runtime version
sProfileString += ", CUDA Runtime Version = "; sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
#else #else
snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
(runtimeVersion % 100) / 10); (runtimeVersion % 100) / 10);

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -30,7 +30,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

File diff suppressed because it is too large Load Diff

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -25,7 +25,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -109,6 +109,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -100,8 +100,10 @@ int main(int argc, char **argv) {
double *b = NULL; double *b = NULL;
float *A = NULL; float *A = NULL;
b = (double *)calloc(N_ROWS, sizeof(double)); checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double)));
A = (float *)calloc(N_ROWS * N_ROWS, sizeof(float)); memset(b, 0, N_ROWS * sizeof(double));
checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float)));
memset(A, 0, N_ROWS * N_ROWS * sizeof(float));
createLinearSystem(A, b); createLinearSystem(A, b);
double *x = NULL; double *x = NULL;
@ -170,6 +172,9 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaFree(d_x)); checkCudaErrors(cudaFree(d_x));
checkCudaErrors(cudaFree(d_x_new)); checkCudaErrors(cudaFree(d_x_new));
checkCudaErrors(cudaFreeHost(A));
checkCudaErrors(cudaFreeHost(b));
printf("&&&& jacobiCudaGraphs %s\n", printf("&&&& jacobiCudaGraphs %s\n",
(fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED");

View File

@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -112,6 +112,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -108,6 +108,6 @@
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -302,14 +302,10 @@ LIBRARIES :=
################################################################################ ################################################################################
FATBIN_FILE := memMapIpc_kernel${TARGET_SIZE}.fatbin PTX_FILE := memMapIpc_kernel${TARGET_SIZE}.ptx
# Gencode arguments # Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) SMS ?=
SMS ?= 35 37 50 52 60 61 70 72 75 80 86
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(GENCODE_FLAGS),) ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS) # Generate SASS code for each SM architecture listed in $(SMS)
@ -395,7 +391,7 @@ endif
# Target rules # Target rules
all: build all: build
build: memMapIPCDrv $(FATBIN_FILE) build: memMapIPCDrv $(PTX_FILE)
check.deps: check.deps:
ifeq ($(SAMPLE_ENABLED),0) ifeq ($(SAMPLE_ENABLED),0)
@ -404,8 +400,8 @@ else
@echo "Sample is ready - all dependencies have been met" @echo "Sample is ready - all dependencies have been met"
endif endif
$(FATBIN_FILE): memMapIpc_kernel.cu $(PTX_FILE): memMapIpc_kernel.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $< $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -ptx $<
$(EXEC) mkdir -p data $(EXEC) mkdir -p data
$(EXEC) cp -f $@ ./data $(EXEC) cp -f $@ ./data
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
@ -426,9 +422,8 @@ run: build
$(EXEC) ./memMapIPCDrv $(EXEC) ./memMapIPCDrv
clean: clean:
rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(FATBIN_FILE) $(FATBIN_FILE) rm -f memMapIPCDrv helper_multiprocess.o memMapIpc.o data/$(PTX_FILE) $(PTX_FILE)
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/memMapIPCDrv rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/memMapIPCDrv
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(PTX_FILE)
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/$(FATBIN_FILE)
clobber: clean clobber: clean

View File

@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuLaunchKernel, cuMemcpyD
## Prerequisites ## Prerequisites
Download and install the [CUDA Toolkit 11.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 11.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run ## Build and Run

View File

@ -38,7 +38,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -67,7 +67,7 @@
<OutputFile>$(OutDir)/memMapIPCDrv.exe</OutputFile> <OutputFile>$(OutDir)/memMapIPCDrv.exe</OutputFile>
</Link> </Link>
<CudaCompile> <CudaCompile>
<CodeGeneration>compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration> <CodeGeneration>compute_35,compute_35;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include> <Include>./;../../Common</Include>
<Defines>WIN32</Defines> <Defines>WIN32</Defines>
@ -105,14 +105,14 @@
<ItemGroup> <ItemGroup>
<ClCompile Include="memMapIpc.cpp" /> <ClCompile Include="memMapIpc.cpp" />
<CudaCompile Include="memMapIpc_kernel.cu"> <CudaCompile Include="memMapIpc_kernel.cu">
<CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.fatbin</CompileOut> <CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.ptx</CompileOut>
<NvccCompilation>fatbin</NvccCompilation> <NvccCompilation>ptx</NvccCompilation>
</CudaCompile> </CudaCompile>
<ClCompile Include="../../Common/helper_multiprocess.cpp" /> <ClCompile Include="../../Common/helper_multiprocess.cpp" />
<ClInclude Include="../../Common/helper_multiprocess.h" /> <ClInclude Include="../../Common/helper_multiprocess.h" />
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.props" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="PropertySheets"> <ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" /> <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -63,7 +63,7 @@
<OutputFile>$(OutDir)/memMapIPCDrv.exe</OutputFile> <OutputFile>$(OutDir)/memMapIPCDrv.exe</OutputFile>
</Link> </Link>
<CudaCompile> <CudaCompile>
<CodeGeneration>compute_35,compute_35;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration> <CodeGeneration>compute_35,compute_35;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include> <Include>./;../../Common</Include>
<Defines>WIN32</Defines> <Defines>WIN32</Defines>
@ -101,14 +101,14 @@
<ItemGroup> <ItemGroup>
<ClCompile Include="memMapIpc.cpp" /> <ClCompile Include="memMapIpc.cpp" />
<CudaCompile Include="memMapIpc_kernel.cu"> <CudaCompile Include="memMapIpc_kernel.cu">
<CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.fatbin</CompileOut> <CompileOut Condition="'$(Platform)'=='x64'">data/%(Filename)64.ptx</CompileOut>
<NvccCompilation>fatbin</NvccCompilation> <NvccCompilation>ptx</NvccCompilation>
</CudaCompile> </CudaCompile>
<ClCompile Include="../../Common/helper_multiprocess.cpp" /> <ClCompile Include="../../Common/helper_multiprocess.cpp" />
<ClInclude Include="../../Common/helper_multiprocess.h" /> <ClInclude Include="../../Common/helper_multiprocess.h" />
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.2.targets" /> <Import Project="$(CUDAPropsPath)\CUDA 11.3.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -64,9 +64,13 @@ typedef struct shmStruct_st {
int sense; int sense;
} shmStruct; } shmStruct;
// define input fatbin file bool findModulePath(const char *, string &, char **, string &);
#ifndef FATBIN_FILE
#define FATBIN_FILE "memMapIpc_kernel64.fatbin" // define input ptx file for different platforms
#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "memMapIpc_kernel64.ptx"
#else
#define PTX_FILE "memMapIpc_kernel32.ptx"
#endif #endif
// `ipcHandleTypeFlag` specifies the platform specific handle type this sample // `ipcHandleTypeFlag` specifies the platform specific handle type this sample
@ -255,23 +259,44 @@ static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size) {
static void memMapGetDeviceFunction(char **argv) { static void memMapGetDeviceFunction(char **argv) {
// first search for the module path before we load the results // first search for the module path before we load the results
string module_path; string module_path, ptx_source;
std::ostringstream fatbin; if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv,
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { ptx_source)) {
exit(EXIT_FAILURE); printf(
"> findModulePath could not find <simpleMemMapIpc> ptx or cubin\n");
exit(EXIT_FAILURE);
}
} else { } else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
if (!fatbin.str().size()) { // Create module from binary file (PTX or CUBIN)
printf("fatbin file empty. exiting..\n"); if (module_path.rfind("ptx") != string::npos) {
exit(EXIT_FAILURE); // in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions];
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(),
jitNumOptions, jitOptions,
(void **)jitOptVals));
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
} else {
checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
} }
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module // Get function handle from module
checkCudaErrors( checkCudaErrors(
cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel")); cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel"));
@ -585,3 +610,37 @@ int main(int argc, char **argv) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
#endif #endif
} }
// Locates `module_file` and, when it is a PTX module, loads its text.
//
// Parameters:
//   module_file - file name of the module to look for.
//   module_path - receives the path returned by sdkFindFilePath on success.
//   argv        - program arguments; only argv[0] is used, forwarded to
//                 sdkFindFilePath.
//   ptx_source  - receives the file contents when the resolved path contains
//                 ".ptx"; left untouched for any other module type or on
//                 failure.
//
// Returns true when the module was located and, for a PTX module, read in
// full. Returns false when the file was not found or could not be read, so the
// caller can fall back to another module or abort.
bool inline findModulePath(const char *module_file, string &module_path,
                           char **argv, string &ptx_source) {
  // NOTE(review): sdkFindFilePath is defined outside this file; if it returns
  // heap memory, the pointer is never released here -- confirm ownership.
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath could not find file: <%s> \n", module_file);
    return false;
  }

  printf("> findModulePath found file at <%s>\n", module_path.c_str());

  // Only PTX modules are read into memory; anything else is reported by path
  // alone.
  if (module_path.rfind(".ptx") == string::npos) {
    return true;
  }

  FILE *fp = fopen(module_path.c_str(), "rb");
  if (!fp) {
    printf("> findModulePath could not open file: <%s>\n",
           module_path.c_str());
    return false;
  }

  // Determine the file size. ftell() reports -1 on failure, which must not be
  // used as a buffer length.
  long file_size = -1;
  if (fseek(fp, 0, SEEK_END) == 0) {
    file_size = ftell(fp);
  }
  if (file_size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    printf("> findModulePath could not determine size of file: <%s>\n",
           module_path.c_str());
    fclose(fp);
    return false;
  }

  // Read straight into a string so the buffer is released on every path.
  string contents(static_cast<size_t>(file_size), '\0');
  size_t bytes_read = 0;
  if (!contents.empty()) {
    bytes_read = fread(&contents[0], sizeof(char), contents.size(), fp);
  }
  fclose(fp);

  // A short read would hand truncated PTX to the JIT; treat it as a failure.
  if (bytes_read != contents.size()) {
    printf("> findModulePath could not read file: <%s>\n",
           module_path.c_str());
    return false;
  }

  ptx_source.swap(contents);
  return true;
}

Some files were not shown because too many files have changed in this diff Show More