diff --git a/README.md b/README.md index 2650266e..e7726a99 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ This section describes the release notes for the CUDA Samples on GitHub only. * Added warp aggregated atomic multi bucket increments kernel using labeled_partition cooperative groups in `warpAggregatedAtomicsCG` which can be used on compute capability 7.0 and above GPU architectures. * Added `binaryPartitionCG`. Demonstrates binary partition cooperative groups and reduction within the thread block. * Added two new reduction kernels in `reduction` one which demonstrates reduce_add_sync intrinstic supported on compute capability 8.0 and another which uses cooperative_groups::reduce function which does thread_block_tile level reduction introduced from CUDA 11.0. +* Added `cudaCompressibleMemory`. Demonstrates compressible memory allocation using cuMemMap API. * Added `simpleVulkanMMAP`. Demonstrates Vulkan CUDA Interop via cuMemMap APIs. * Added `concurrentKernels`. Demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU. * Dropped Mac OSX support from all samples. @@ -146,13 +147,13 @@ The samples makefiles can take advantage of certain options: **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[cudaNvSci](./Samples/cudaNvSci)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | -**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | -**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | -**[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | -**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | -**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | -**[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | +**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | +**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | +**[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | +**[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | +**[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | +**[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | +**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | #### Windows **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | @@ -161,13 +162,13 @@ The samples makefiles can take advantage of certain options: **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | -**[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | -**[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | -**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | -**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[concurrentKernels](./Samples/concurrentKernels)** | -**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | -**[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | -**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[matrixMul](./Samples/matrixMul)** | +**[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | +**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | +**[reduction](./Samples/reduction)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[simpleD3D11](./Samples/simpleD3D11)** | +**[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | +**[concurrentKernels](./Samples/concurrentKernels)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | +**[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | +**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[matrixMul](./Samples/matrixMul)** | ## Dependencies diff --git a/Samples/cudaCompressibleMemory/Makefile b/Samples/cudaCompressibleMemory/Makefile new file mode 100644 index 00000000..d29b16e4 --- /dev/null +++ b/Samples/cudaCompressibleMemory/Makefile @@ -0,0 +1,413 @@ +################################################################################ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif + +# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now. +ifeq ($(HOST_ARCH),aarch64) + ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux)) + HOST_ARCH := sbsa + TARGET_ARCH := sbsa + endif +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),sbsa) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu + LDFLAGS += -lsocket + LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu + ifneq ($(TARGET_FS),) + LDFLAGS += -rpath=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath=$(TARGET_FS)/usr/libnvidia -L $(TARGET_FS)/usr/libnvidia + endif + ifdef TARGET_OVERRIDE # cuda toolkit targets override + NVCCFLAGS += -target-dir $(TARGET_OVERRIDE) + endif + endif +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - cudaCompressibleMemory is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - cudaCompressibleMemory is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - cudaCompressibleMemory is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on sbsa +ifeq ($(TARGET_ARCH),sbsa) + $(info >>> WARNING - cudaCompressibleMemory is not supported on sbsa - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 35 37 50 52 60 61 70 72 75 80 +else +SMS ?= 35 37 50 52 60 61 70 75 80 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(TARGET_OS),darwin) + ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA +else + ifeq ($(TARGET_ARCH),x86_64) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs + CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs + endif + + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs + ifdef TARGET_OVERRIDE + CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs + endif + endif + + ifeq ($(TARGET_ARCH),ppc64le) + CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs + endif + + ifeq ($(HOST_ARCH),ppc64le) + CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs + endif + + CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null) + ifeq ("$(CUDALIB)","") + $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<) + SAMPLE_ENABLED := 0 + else + CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" ) + LIBRARIES += -L$(CUDALIB) -lcuda + endif +endif + +LIBRARIES += -lcudart_static + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: cudaCompressibleMemory + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +compMalloc.o:compMalloc.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +saxpy.o:saxpy.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +cudaCompressibleMemory: compMalloc.o saxpy.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./cudaCompressibleMemory + +clean: + rm -f cudaCompressibleMemory compMalloc.o saxpy.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cudaCompressibleMemory + +clobber: clean diff --git a/Samples/cudaCompressibleMemory/README.md b/Samples/cudaCompressibleMemory/README.md new file mode 100644 index 00000000..d927f641 --- /dev/null +++ b/Samples/cudaCompressibleMemory/README.md @@ -0,0 +1,71 @@ +# cudaCompressibleMemory - CUDA Compressible Memory + +## Description + +This sample demonstrates the compressible memory allocation using cuMemMap API. + +## Key Concepts + +CUDA Driver API, Compressible Memory, MMAP + +## Supported SM Architectures + +[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le + +## CUDA APIs involved + +### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html) +cuMemAlloc, cuMemFree, cuDeviceGetAttribute, cuMemGetAllocationGranularity, cuMemAddressReserve, cuMemCreate, cuMemMap, cuMemSetAccess, cuMemUnmap, cuMemAddressFree### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMalloc, cudaFree + +## Prerequisites + +Download and install the [CUDA Toolkit 11.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/cudaCompressibleMemory/compMalloc.cpp b/Samples/cudaCompressibleMemory/compMalloc.cpp new file mode 100644 index 00000000..8641b5f9 --- /dev/null +++ b/Samples/cudaCompressibleMemory/compMalloc.cpp @@ -0,0 +1,122 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +static int printOnce = 1; + +cudaError_t setProp(CUmemAllocationProp *prop) +{ + CUdevice currentDevice; + if (cuCtxGetDevice(¤tDevice) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + int compressionAvailable = 0; + if (cuDeviceGetAttribute(&compressionAvailable, + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED, + currentDevice) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + if (printOnce) + { + printf("Generic memory compression support %s\n", + compressionAvailable ? "is available" : "is not available"); + printOnce = 0; + } + + memset(prop, 0, sizeof(CUmemAllocationProp)); + prop->type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop->location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop->location.id = currentDevice; + + if (compressionAvailable) + prop->allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_GENERIC; + + return cudaSuccess; +} + +cudaError_t cudaMallocCompressible(void **adr, size_t size) +{ + CUmemAllocationProp prop = {}; + cudaError_t err = setProp(&prop); + if (err != cudaSuccess) + return err; + + size_t granularity = 0; + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + size = ((size - 1) / granularity + 1) * granularity; + CUdeviceptr dptr; + if (cuMemAddressReserve(&dptr, size, 0, 0, 0) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + CUmemGenericAllocationHandle allocationHandle; + if (cuMemCreate(&allocationHandle, size, &prop, 0) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + if (cuMemMap(dptr, size, 0, allocationHandle, 0) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + if (cuMemRelease(allocationHandle) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + CUmemAccessDesc accessDescriptor; + accessDescriptor.location.id = prop.location.id; + accessDescriptor.location.type = prop.location.type; + accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + if (cuMemSetAccess(dptr, size, &accessDescriptor, 1) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + + *adr = (void *)dptr; + return cudaSuccess; +} + +cudaError_t cudaFreeCompressible(void *ptr, size_t size) +{ + CUmemAllocationProp prop = {}; + cudaError_t err = setProp(&prop); + if (err != cudaSuccess) + return err; + + size_t granularity = 0; + if (cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) + return cudaErrorMemoryAllocation; + size = ((size - 1) / granularity + 1) * granularity; + + if (ptr == NULL) + return cudaSuccess; + if (cuMemUnmap((CUdeviceptr)ptr, size) != CUDA_SUCCESS || + cuMemAddressFree((CUdeviceptr)ptr, size) != CUDA_SUCCESS) + return cudaErrorInvalidValue; + return cudaSuccess; +} diff --git a/Samples/cudaCompressibleMemory/compMalloc.h b/Samples/cudaCompressibleMemory/compMalloc.h new file mode 100644 index 00000000..de72cce0 --- /dev/null +++ b/Samples/cudaCompressibleMemory/compMalloc.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMP_MALLOC_H +#define COMP_MALLOC_H + +cudaError_t cudaMallocCompressible(void **adr, size_t size); +cudaError_t cudaFreeCompressible(void *ptr, size_t size); + +#endif diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2012.sln b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2012.sln new file mode 100644 index 00000000..8fe36227 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaCompressibleMemory", "cudaCompressibleMemory_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2012.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2012.vcxproj new file mode 100644 index 00000000..65e4a683 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2012.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cudaCompressibleMemory_vs2012 + cudaCompressibleMemory + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cuda.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cudaCompressibleMemory.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2013.sln b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2013.sln new file mode 100644 index 00000000..e19042ac --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaCompressibleMemory", "cudaCompressibleMemory_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2013.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2013.vcxproj new file mode 100644 index 00000000..db459933 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2013.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cudaCompressibleMemory_vs2013 + cudaCompressibleMemory + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cuda.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cudaCompressibleMemory.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2015.sln b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2015.sln new file mode 100644 index 00000000..e295029e --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaCompressibleMemory", "cudaCompressibleMemory_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2015.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2015.vcxproj new file mode 100644 index 00000000..706e5d54 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2015.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cudaCompressibleMemory_vs2015 + cudaCompressibleMemory + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cuda.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cudaCompressibleMemory.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.sln b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.sln new file mode 100644 index 00000000..aab20f2a --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaCompressibleMemory", "cudaCompressibleMemory_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj new file mode 100644 index 00000000..7294e0b9 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2017.vcxproj @@ -0,0 +1,113 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cudaCompressibleMemory_vs2017 + cudaCompressibleMemory + + + + $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) + $(LatestTargetPlatformVersion) + $(WindowsTargetPlatformVersion) + + + + Application + MultiByte + v141 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cuda.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cudaCompressibleMemory.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.sln b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.sln new file mode 100644 index 00000000..1e098bd8 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2019 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaCompressibleMemory", "cudaCompressibleMemory_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj new file mode 100644 index 00000000..4ae71760 --- /dev/null +++ b/Samples/cudaCompressibleMemory/cudaCompressibleMemory_vs2019.vcxproj @@ -0,0 +1,109 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + cudaCompressibleMemory_vs2019 + cudaCompressibleMemory + + + + + Application + MultiByte + v142 + 10.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); + + + Console + cuda.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/cudaCompressibleMemory.exe + + + compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/cudaCompressibleMemory/saxpy.cu b/Samples/cudaCompressibleMemory/saxpy.cu new file mode 100644 index 00000000..b744a98a --- /dev/null +++ b/Samples/cudaCompressibleMemory/saxpy.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// +// This sample uses the compressible memory allocation if device supports it +// and performs saxpy on it. +// Compressible memory may give better performance if the data is amenable to +// compression. + +#include +#include +#define CUDA_DRIVER_API +#include "helper_cuda.h" +#include "compMalloc.h" + +__global__ void saxpy(float a, float4 *x, float4 *y, float4 *z, int64_t n) +{ + int64_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + z[i] = make_float4(a * x[i].x + y[i].x, + a * x[i].y + y[i].y, + a * x[i].z + y[i].z, + a * x[i].w + y[i].w); +} + +__global__ void init(float4 *x, float4 *y, float4 *z, float val, int64_t n) +{ + int64_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + { + x[i] = make_float4(val, val, val, val); + y[i] = make_float4(val, val, val, val); + z[i] = make_float4(val, val, val, val); + } +} + +int main(int argc, char **argv) +{ + int devId, UseCompressibleMemory = 1; + int64_t n = 10485760; + + if (checkCmdLineFlag(argc, (const char **)argv, "help") || + checkCmdLineFlag(argc, (const char **)argv, "?")) { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -UseCompressibleMemory=0 or 1 (default is 1 : Use compressible memory)\n"); + exit(EXIT_SUCCESS); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "UseCompressibleMemory")) { + UseCompressibleMemory = getCmdLineArgumentInt(argc, (const char **)argv, "UseCompressibleMemory"); + if (UseCompressibleMemory > 1) { + printf("Permitted options for UseCompressibleMemory are 0 or 1, you have entered %d \n", UseCompressibleMemory); + exit(EXIT_WAIVED); + } + } + + devId = findCudaDevice(argc, (const char**)argv); + CUdevice currentDevice; + checkCudaErrors(cuCtxGetDevice(¤tDevice)); + + // Check that the selected device supports virtual address management + int vam_supported = -1; + checkCudaErrors(cuDeviceGetAttribute(&vam_supported, + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + currentDevice)); + printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", currentDevice, vam_supported); + if (vam_supported == 0) { + printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT, so not using compressible memory.\n", currentDevice); + UseCompressibleMemory = 0; + } + + int nsm = 0; + checkCudaErrors(cudaDeviceGetAttribute(&nsm, cudaDevAttrMultiProcessorCount, devId)); + printf("Found %d SMs on the device\n", nsm); + + float4 *x, *y, *z; + size_t size = n * sizeof(float4); + if (UseCompressibleMemory) { + checkCudaErrors(cudaMallocCompressible((void **)&x, size)); + checkCudaErrors(cudaMallocCompressible((void **)&y, size)); + checkCudaErrors(cudaMallocCompressible((void **)&z, size)); + } + else { + printf("Using non compressible memory\n"); + checkCudaErrors(cudaMalloc((void **)&x, size)); + checkCudaErrors(cudaMalloc((void **)&y, size)); + checkCudaErrors(cudaMalloc((void **)&z, size)); + } + + printf("Running saxpy on %lu bytes\n", size); + + cudaEvent_t start, stop; + float ms; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + dim3 threads(1024, 1, 1); + dim3 blocks; + + init<<>>(x, y, z, 1.0f, n); + checkCudaErrors(cudaDeviceSynchronize()); + + // Running with single element per thread, lots of blocks + blocks = dim3(n / threads.x, 1, 1); + checkCudaErrors(cudaEventRecord(start)); + saxpy<<>>(1.0f, x, y, z, n); + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&ms, start, stop)); + printf("Running saxpy with %d blocks x %d threads = %.3f ms %.3f TB/s\n", blocks.x, threads.x, ms, (size*3)/ms/1e9); + + if (UseCompressibleMemory) { + checkCudaErrors(cudaFreeCompressible(x, size)); + checkCudaErrors(cudaFreeCompressible(y, size)); + checkCudaErrors(cudaFreeCompressible(z, size)); + } + else { + checkCudaErrors(cudaFree(x)); + checkCudaErrors(cudaFree(y)); + checkCudaErrors(cudaFree(z)); + } + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/Samples/simpleD3D12/ShaderStructs.h b/Samples/simpleD3D12/ShaderStructs.h index a656994d..80bd1d98 100755 --- a/Samples/simpleD3D12/ShaderStructs.h +++ b/Samples/simpleD3D12/ShaderStructs.h @@ -27,9 +27,12 @@ #pragma once +#include +#include +#include +#include #include #include "helper_cuda.h" -#include "stdafx.h" using namespace DirectX; @@ -38,15 +41,6 @@ struct Vertex { XMFLOAT4 color; }; -#if 0 -// Constant buffer used to send MVP matrices to the vertex shader. -struct ModelViewProjectionConstantBuffer -{ - XMFLOAT4X4 model; - XMFLOAT4X4 view; - XMFLOAT4X4 projection; -}; -#endif void RunSineWaveKernel(size_t mesh_width, size_t mesh_height, Vertex *cudaDevVertptr, cudaStream_t streamToRun, diff --git a/Samples/simpleD3D12/simpleD3D12.cpp b/Samples/simpleD3D12/simpleD3D12.cpp index 9df21d63..5eb64764 100755 --- a/Samples/simpleD3D12/simpleD3D12.cpp +++ b/Samples/simpleD3D12/simpleD3D12.cpp @@ -25,8 +25,16 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include -#include "stdafx.h" +#include "d3dx12.h" + +#include +#include +#include + +#include +#include "ShaderStructs.h" #include "simpleD3D12.h" #include diff --git a/Samples/simpleD3D12/sinewave_cuda.cu b/Samples/simpleD3D12/sinewave_cuda.cu index 282ee393..6f3adf35 100755 --- a/Samples/simpleD3D12/sinewave_cuda.cu +++ b/Samples/simpleD3D12/sinewave_cuda.cu @@ -25,9 +25,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include "ShaderStructs.h" -#include "helper_cuda.h" __global__ void sinewave_gen_kernel(Vertex *vertices, unsigned int width, unsigned int height, float time) { diff --git a/Samples/simpleVulkanMMAP/main.cpp b/Samples/simpleVulkanMMAP/main.cpp index 110e6111..53c05952 100644 --- a/Samples/simpleVulkanMMAP/main.cpp +++ b/Samples/simpleVulkanMMAP/main.cpp @@ -53,6 +53,8 @@ #define NUM_SIMULATION_POINTS 50000 +std::string execution_path; + class VulkanCudaPi : public VulkanBaseApp { typedef struct UniformBufferObject_st { @@ -86,8 +88,10 @@ public: m_lastFrame(0) { // Add our compiled vulkan shader files - m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, "montecarlo.vert")); - m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, "montecarlo.frag")); + char* vertex_shader_path = sdkFindFilePath("montecarlo.vert", execution_path.c_str()); + char* fragment_shader_path = sdkFindFilePath("montecarlo.frag", execution_path.c_str()); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); + m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); } ~VulkanCudaPi() { @@ -303,8 +307,9 @@ public: } }; -int main() +int main(int argc, char **argv) { + execution_path = argv[0]; VulkanCudaPi app(NUM_SIMULATION_POINTS); app.init(); app.mainLoop();