Add and update samples with CUDA 10.1 support

This commit is contained in:
Mahesh Doijade 2019-01-23 01:34:43 +05:30
parent 32f0fc6111
commit b458dafcd6
201 changed files with 9072 additions and 286 deletions

View File

@ -282,6 +282,42 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) {
}
#endif
#ifdef NVJPEGAPI
// nvJPEG API errors
static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
switch (error) {
case NVJPEG_STATUS_SUCCESS:
return "NVJPEG_STATUS_SUCCESS";
case NVJPEG_STATUS_NOT_INITIALIZED:
return "NVJPEG_STATUS_NOT_INITIALIZED";
case NVJPEG_STATUS_INVALID_PARAMETER:
return "NVJPEG_STATUS_INVALID_PARAMETER";
case NVJPEG_STATUS_BAD_JPEG:
return "NVJPEG_STATUS_BAD_JPEG";
case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
case NVJPEG_STATUS_ALLOCATOR_FAILURE:
return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
case NVJPEG_STATUS_EXECUTION_FAILED:
return "NVJPEG_STATUS_EXECUTION_FAILED";
case NVJPEG_STATUS_ARCH_MISMATCH:
return "NVJPEG_STATUS_ARCH_MISMATCH";
case NVJPEG_STATUS_INTERNAL_ERROR:
return "NVJPEG_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
#ifdef NV_NPPIDEFS_H
// NPP API errors
static const char *_cudaGetErrorEnum(NppStatus error) {

View File

@ -0,0 +1,178 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "helper_multiprocess.h"
#include <cstdlib>
#include <string>
int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info)
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
info->size = sz;
info->shmHandle = CreateFileMapping(INVALID_HANDLE_VALUE,
NULL,
PAGE_READWRITE,
0,
(DWORD)sz,
name);
if (info->shmHandle == 0) {
return GetLastError();
}
info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
if (info->addr == NULL) {
return GetLastError();
}
return 0;
#else
int status = 0;
info->size = sz;
info->shmFd = shm_open(name, O_RDWR | O_CREAT, 0777);
if (info->shmFd < 0) {
return errno;
}
status = ftruncate(info->shmFd, sz);
if (status != 0) {
return status;
}
info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
if (info->addr == NULL) {
return errno;
}
return 0;
#endif
}
int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info)
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
info->size = sz;
info->shmHandle = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, name);
if (info->shmHandle == 0) {
return GetLastError();
}
info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
if (info->addr == NULL) {
return GetLastError();
}
return 0;
#else
info->size = sz;
info->shmFd = shm_open(name, O_RDWR, 0777);
if (info->shmFd < 0) {
return errno;
}
info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
if (info->addr == NULL) {
return errno;
}
return 0;
#endif
}
void sharedMemoryClose(sharedMemoryInfo *info)
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
if (info->addr) {
UnmapViewOfFile(info->addr);
}
if (info->shmHandle) {
CloseHandle(info->shmHandle);
}
#else
if (info->addr) {
munmap(info->addr, info->size);
}
if (info->shmFd) {
close(info->shmFd);
}
#endif
}
int spawnProcess(Process *process, const char *app, char * const *args)
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
STARTUPINFO si = {0};
BOOL status;
size_t arglen = 0;
size_t argIdx = 0;
std::string arg_string;
memset(process, 0, sizeof(*process));
while (*args) {
arg_string.append(*args).append(1, ' ');
args++;
}
status = CreateProcess(app, LPSTR(arg_string.c_str()), NULL, NULL, FALSE, 0, NULL, NULL, &si, process);
return status ? 0 : GetLastError();
#else
*process = fork();
if (*process == 0) {
if (0 > execvp(app, args)) {
return errno;
}
}
else if (*process < 0) {
return errno;
}
return 0;
#endif
}
int waitProcess(Process *process)
{
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
DWORD exitCode;
WaitForSingleObject(process->hProcess, INFINITE);
GetExitCodeProcess(process->hProcess, &exitCode);
CloseHandle(process->hProcess);
CloseHandle(process->hThread);
return (int)exitCode;
#else
int status = 0;
do {
if (0 > waitpid(*process, &status, 0)) {
return errno;
}
} while (!WIFEXITED(status));
return WEXITSTATUS(status);
#endif
}

View File

@ -0,0 +1,71 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef HELPER_MULTIPROCESS_H
#define HELPER_MULTIPROCESS_H
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#else
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#endif
typedef struct sharedMemoryInfo_st {
void *addr;
size_t size;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
HANDLE shmHandle;
#else
int shmFd;
#endif
} sharedMemoryInfo;
int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info);
int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info);
void sharedMemoryClose(sharedMemoryInfo *info);
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
typedef PROCESS_INFORMATION Process;
#else
typedef pid_t Process;
#endif
int spawnProcess(Process *process, const char *app, char * const *args);
int waitProcess(Process *process);
#endif // HELPER_MULTIPROCESS_H

View File

@ -234,6 +234,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - UnifiedMemoryPerf is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@ -246,7 +252,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -52,6 +52,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -30,7 +30,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -105,6 +105,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,304 @@
################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
endif
ifeq ($(TARGET_OS),qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM
LDFLAGS += -lsocket
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: bandwidthTest
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
bandwidthTest.o:bandwidthTest.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
bandwidthTest: bandwidthTest.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./bandwidthTest
clean:
rm -f bandwidthTest bandwidthTest.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/bandwidthTest
clobber: clean

View File

@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>bandwidthTest</name>
<cuda_api_list>
<toolkit>cudaSetDevice</toolkit>
<toolkit>cudaHostAlloc</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMallocHost</toolkit>
<toolkit>cudaFreeHost</toolkit>
<toolkit>cudaMemcpy</toolkit>
<toolkit>cudaMemcpyAsync</toolkit>
<toolkit>cudaEventCreate</toolkit>
<toolkit>cudaEventRecord</toolkit>
<toolkit>cudaEventDestroy</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaEventElapsedTime</toolkit>
</cuda_api_list>
<description><![CDATA[This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../common/inc</path>
</includepaths>
<keyconcepts>
<concept level="basic">CUDA Streams and Events</concept>
<concept level="basic">Performance Strategies</concept>
</keyconcepts>
<keywords>
<keyword>GPGPU</keyword>
<keyword>bandwidth</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>bandwidthTest.cu</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm30</sm-arch>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>Bandwidth Test</title>
<type>exe</type>
</entry>

View File

@ -0,0 +1,94 @@
# bandwidthTest - Bandwidth Test
## Description
This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.
## Key Concepts
CUDA Streams and Events, Performance Strategies
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows, MacOSX
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy, cudaMemcpyAsync, cudaEventCreate, cudaEventRecord, cudaEventDestroy, cudaDeviceSynchronize, cudaEventElapsedTime
## Prerequisites
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
### Mac
The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
```
$ make SMS="A B ..."
```
* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=clang
```
## References (for more details)

View File

@ -0,0 +1,969 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This is a simple test program to measure the memcopy bandwidth of the GPU.
* It can measure device to device copy bandwidth, host to device copy bandwidth
* for pageable and pinned memory, and device to host copy bandwidth for
* pageable and pinned memory.
*
* Usage:
* ./bandwidthTest [option]...
*/
// CUDA runtime
#include <cuda_runtime.h>
// includes
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream>
#include <memory>
static const char *sSDKsample = "CUDA Bandwidth Test";
// defines, project
#define MEMCOPY_ITERATIONS 100
#define DEFAULT_SIZE (32 * (1e6)) // 32 M
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M
// shmoo mode defines
#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M
#define SHMOO_MEMSIZE_START (1e3) // 1 KB
#define SHMOO_INCREMENT_1KB (1e3) // 1 KB
#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB
#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB
#define SHMOO_INCREMENT_1MB (1e6) // 1 MB
#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB
#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB
#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB
#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB
#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB
#define SHMOO_LIMIT_1MB (1e6) // 1 MB
#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB
#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB
// CPU cache flush
#define FLUSH_SIZE (256 * 1024 * 1024)
char *flush_buf;
// enums, project
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
"Device to Device", NULL};
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
// if true, use CPU based timing for everything
static bool bDontUseGPUTiming;
int *pArgc = NULL;
char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc);
void testBandwidthRange(unsigned int start, unsigned int end,
unsigned int increment, memcpyKind kind,
printMode printmode, memoryMode memMode,
int startDevice, int endDevice, bool wc);
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind,
memoryMode memMode, int iNumDevs, bool wc);
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int iNumDevs, bool wc);
void printHelp(void);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
pArgc = &argc;
pArgv = argv;
flush_buf = (char *)malloc(FLUSH_SIZE);
// set logfile name and start logs
printf("[%s] - Starting...\n", sSDKsample);
int iRetVal = runTest(argc, (const char **)argv);
if (iRetVal < 0) {
checkCudaErrors(cudaSetDevice(0));
}
// finish
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
free(flush_buf);
exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
}
///////////////////////////////////////////////////////////////////////////////
// Parse args, run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
int runTest(const int argc, const char **argv) {
int start = DEFAULT_SIZE;
int end = DEFAULT_SIZE;
int startDevice = 0;
int endDevice = 0;
int increment = DEFAULT_INCREMENT;
testMode mode = QUICK_MODE;
bool htod = false;
bool dtoh = false;
bool dtod = false;
bool wc = false;
char *modeStr;
char *device = NULL;
printMode printmode = USER_READABLE;
char *memModeStr = NULL;
memoryMode memMode = PINNED;
// process command line args
if (checkCmdLineFlag(argc, argv, "help")) {
printHelp();
return 0;
}
if (checkCmdLineFlag(argc, argv, "csv")) {
printmode = CSV;
}
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
if (strcmp(memModeStr, "pageable") == 0) {
memMode = PAGEABLE;
} else if (strcmp(memModeStr, "pinned") == 0) {
memMode = PINNED;
} else {
printf("Invalid memory mode - valid modes are pageable or pinned\n");
printf("See --help for more information\n");
return -1000;
}
} else {
// default - pinned memory
memMode = PINNED;
}
if (getCmdLineArgumentString(argc, argv, "device", &device)) {
int deviceCount;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
cudaGetErrorString(error_id));
exit(EXIT_FAILURE);
}
if (deviceCount == 0) {
printf("!!!!!No devices found!!!!!\n");
return -2000;
}
if (strcmp(device, "all") == 0) {
printf(
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
"!!!!!!\n\n");
startDevice = 0;
endDevice = deviceCount - 1;
} else {
startDevice = endDevice = atoi(device);
if (startDevice >= deviceCount || startDevice < 0) {
printf(
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
"used !!!!!\n",
startDevice, 0);
startDevice = endDevice = 0;
}
}
}
printf("Running on...\n\n");
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
cudaDeviceProp deviceProp;
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
if (error_id == cudaSuccess) {
printf(" Device %d: %s\n", currentDevice, deviceProp.name);
if (deviceProp.computeMode == cudaComputeModeProhibited) {
fprintf(stderr,
"Error: device is running in <Compute Mode Prohibited>, no "
"threads can use ::cudaSetDevice().\n");
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
}
} else {
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
cudaGetErrorString(error_id));
checkCudaErrors(cudaSetDevice(currentDevice));
exit(EXIT_FAILURE);
}
}
if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
// figure out the mode
if (strcmp(modeStr, "quick") == 0) {
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
} else if (strcmp(modeStr, "shmoo") == 0) {
printf(" Shmoo Mode\n\n");
mode = SHMOO_MODE;
} else if (strcmp(modeStr, "range") == 0) {
printf(" Range Mode\n\n");
mode = RANGE_MODE;
} else {
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
printf("See --help for more information\n");
return -3000;
}
} else {
// default mode - quick
printf(" Quick Mode\n\n");
mode = QUICK_MODE;
}
if (checkCmdLineFlag(argc, argv, "htod")) {
htod = true;
}
if (checkCmdLineFlag(argc, argv, "dtoh")) {
dtoh = true;
}
if (checkCmdLineFlag(argc, argv, "dtod")) {
dtod = true;
}
#if CUDART_VERSION >= 2020
if (checkCmdLineFlag(argc, argv, "wc")) {
wc = true;
}
#endif
if (checkCmdLineFlag(argc, argv, "cputiming")) {
bDontUseGPUTiming = true;
}
if (!htod && !dtoh && !dtod) {
// default: All
htod = true;
dtoh = true;
dtod = true;
}
if (RANGE_MODE == mode) {
if (checkCmdLineFlag(argc, (const char **)argv, "start")) {
start = getCmdLineArgumentInt(argc, argv, "start");
if (start <= 0) {
printf("Illegal argument - start must be greater than zero\n");
return -4000;
}
} else {
printf("Must specify a starting size in range mode\n");
printf("See --help for more information\n");
return -5000;
}
if (checkCmdLineFlag(argc, (const char **)argv, "end")) {
end = getCmdLineArgumentInt(argc, argv, "end");
if (end <= 0) {
printf("Illegal argument - end must be greater than zero\n");
return -6000;
}
if (start > end) {
printf("Illegal argument - start is greater than end\n");
return -7000;
}
} else {
printf("Must specify an end size in range mode.\n");
printf("See --help for more information\n");
return -8000;
}
if (checkCmdLineFlag(argc, argv, "increment")) {
increment = getCmdLineArgumentInt(argc, argv, "increment");
if (increment <= 0) {
printf("Illegal argument - increment must be greater than zero\n");
return -9000;
}
} else {
printf("Must specify an increment in user mode\n");
printf("See --help for more information\n");
return -10000;
}
}
if (htod) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
memMode, startDevice, endDevice, wc);
}
if (dtoh) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
memMode, startDevice, endDevice, wc);
}
if (dtod) {
testBandwidth((unsigned int)start, (unsigned int)end,
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
memMode, startDevice, endDevice, wc);
}
// Ensure that we reset all CUDA Devices in question
for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
cudaSetDevice(nDevice);
}
return 0;
}
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
testMode mode, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
switch (mode) {
case QUICK_MODE:
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
endDevice, wc);
break;
case RANGE_MODE:
testBandwidthRange(start, end, increment, kind, printmode, memMode,
startDevice, endDevice, wc);
break;
case SHMOO_MODE:
testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
break;
default:
break;
}
}
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode,
startDevice, endDevice, wc);
}
///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start, unsigned int end,
unsigned int increment, memcpyKind kind,
printMode printmode, memoryMode memMode,
int startDevice, int endDevice, bool wc) {
// count the number of copies we're going to run
unsigned int count = 1 + ((end - start) / increment);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double));
// Before calculating the cumulative bandwidth, initialize bandwidths array to
// NULL
for (unsigned int i = 0; i < count; i++) {
bandwidths[i] = 0.0;
}
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
cudaSetDevice(currentDevice);
// run each of the copies
for (unsigned int i = 0; i < count; i++) {
memSizes[i] = start + i * increment;
switch (kind) {
case DEVICE_TO_HOST:
bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
break;
}
}
} // Complete the bandwidth computation on all the devices
// print results
if (printmode == CSV) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
} else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
}
// clean up
free(memSizes);
free(bandwidths);
}
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
memoryMode memMode, int startDevice, int endDevice,
bool wc) {
// count the number of copies to make
unsigned int count =
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
double *bandwidths = (double *)malloc(count * sizeof(double));
// Before calculating the cumulative bandwidth, initialize bandwidths array to
// NULL
for (unsigned int i = 0; i < count; i++) {
bandwidths[i] = 0.0;
}
// Use the device asked by the user
for (int currentDevice = startDevice; currentDevice <= endDevice;
currentDevice++) {
cudaSetDevice(currentDevice);
// Run the shmoo
int iteration = 0;
unsigned int memSize = 0;
while (memSize <= SHMOO_MEMSIZE_MAX) {
if (memSize < SHMOO_LIMIT_20KB) {
memSize += SHMOO_INCREMENT_1KB;
} else if (memSize < SHMOO_LIMIT_50KB) {
memSize += SHMOO_INCREMENT_2KB;
} else if (memSize < SHMOO_LIMIT_100KB) {
memSize += SHMOO_INCREMENT_10KB;
} else if (memSize < SHMOO_LIMIT_1MB) {
memSize += SHMOO_INCREMENT_100KB;
} else if (memSize < SHMOO_LIMIT_16MB) {
memSize += SHMOO_INCREMENT_1MB;
} else if (memSize < SHMOO_LIMIT_32MB) {
memSize += SHMOO_INCREMENT_2MB;
} else {
memSize += SHMOO_INCREMENT_4MB;
}
memSizes[iteration] = memSize;
switch (kind) {
case DEVICE_TO_HOST:
bandwidths[iteration] +=
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
break;
case HOST_TO_DEVICE:
bandwidths[iteration] +=
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
break;
case DEVICE_TO_DEVICE:
bandwidths[iteration] +=
testDeviceToDeviceTransfer(memSizes[iteration]);
break;
}
iteration++;
printf(".");
fflush(0);
}
} // Complete the bandwidth computation on all the devices
// print results
printf("\n");
if (CSV == printmode) {
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
} else {
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
(1 + endDevice - startDevice), wc);
}
// clean up
free(memSizes);
free(bandwidths);
}
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
bool wc) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
unsigned char *h_idata = NULL;
unsigned char *h_odata = NULL;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
if (PINNED == memMode) {
// pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
#else
checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
} else {
// pageable memory mode - use malloc
h_idata = (unsigned char *)malloc(memSize);
h_odata = (unsigned char *)malloc(memSize);
if (h_idata == 0 || h_odata == 0) {
fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
exit(EXIT_FAILURE);
}
}
// initialize the memory
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// initialize the device memory
checkCudaErrors(
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// copy data from GPU to Host
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
cudaMemcpyDeviceToHost, 0));
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
sdkStopTimer(&timer);
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
}
} else {
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
checkCudaErrors(
cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
memset(flush_buf, i, FLUSH_SIZE);
}
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
// clean up memory
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
sdkDeleteTimer(&timer);
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_idata));
checkCudaErrors(cudaFreeHost(h_odata));
} else {
free(h_idata);
free(h_odata);
}
checkCudaErrors(cudaFree(d_idata));
return bandwidthInGBs;
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
bool wc) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
unsigned char *h_odata = NULL;
if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
(wc) ? cudaHostAllocWriteCombined : 0));
#else
// pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
} else {
// pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
if (h_odata == 0) {
fprintf(stderr, "Not enough memory available on host to run test!\n");
exit(EXIT_FAILURE);
}
}
unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
fprintf(stderr, "Not enough memory available on host to run test!\n");
exit(EXIT_FAILURE);
}
// initialize the memory
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
h_odata[i] = (unsigned char)(i & 0xff);
}
for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) {
h_cacheClear1[i] = (unsigned char)(i & 0xff);
h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
// copy host memory to device memory
if (PINNED == memMode) {
if (bDontUseGPUTiming) sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
cudaMemcpyHostToDevice, 0));
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
sdkStopTimer(&timer);
elapsedTimeInMs = sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
}
} else {
elapsedTimeInMs = 0;
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
sdkStartTimer(&timer);
checkCudaErrors(
cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
sdkStopTimer(&timer);
elapsedTimeInMs += sdkGetTimerValue(&timer);
sdkResetTimer(&timer);
memset(flush_buf, i, FLUSH_SIZE);
}
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
// clean up memory
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
sdkDeleteTimer(&timer);
if (PINNED == memMode) {
checkCudaErrors(cudaFreeHost(h_odata));
} else {
free(h_odata);
}
free(h_cacheClear1);
free(h_cacheClear2);
checkCudaErrors(cudaFree(d_idata));
return bandwidthInGBs;
}
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) {
StopWatchInterface *timer = NULL;
float elapsedTimeInMs = 0.0f;
float bandwidthInGBs = 0.0f;
cudaEvent_t start, stop;
sdkCreateTimer(&timer);
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// allocate host memory
unsigned char *h_idata = (unsigned char *)malloc(memSize);
if (h_idata == 0) {
fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
exit(EXIT_FAILURE);
}
// initialize the host memory
for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
unsigned char *d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));
// initialize memory
checkCudaErrors(
cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));
// run the memcopy
sdkStartTimer(&timer);
checkCudaErrors(cudaEventRecord(start, 0));
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
checkCudaErrors(
cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
}
checkCudaErrors(cudaEventRecord(stop, 0));
// Since device to device memory copies are non-blocking,
// cudaDeviceSynchronize() is required in order to get
// proper timing.
checkCudaErrors(cudaDeviceSynchronize());
// get the total elapsed time in ms
sdkStopTimer(&timer);
checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
if (bDontUseGPUTiming) {
elapsedTimeInMs = sdkGetTimerValue(&timer);
}
// calculate bandwidth in GB/s
double time_s = elapsedTimeInMs / 1e3;
bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
bandwidthInGBs = bandwidthInGBs / time_s;
// clean up memory
sdkDeleteTimer(&timer);
free(h_idata);
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata));
return bandwidthInGBs;
}
/////////////////////////////////////////////////////////
// print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind,
memoryMode memMode, int iNumDevs, bool wc) {
printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
if (wc) {
printf(" Write-Combined Memory Writes are Enabled");
}
printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");
unsigned int i;
for (i = 0; i < (count - 1); i++) {
printf(" %u\t\t\t%s%.1f\n", memSizes[i],
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
printf(" %u\t\t\t%s%.1f\n\n", memSizes[i],
(memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
///////////////////////////////////////////////////////////////////////////
// print results in a database format
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int iNumDevs, bool wc) {
std::string sConfig;
// log config information
if (kind == DEVICE_TO_DEVICE) {
sConfig += "D2D";
} else {
if (kind == DEVICE_TO_HOST) {
sConfig += "D2H";
} else if (kind == HOST_TO_DEVICE) {
sConfig += "H2D";
}
if (memMode == PAGEABLE) {
sConfig += "-Paged";
} else if (memMode == PINNED) {
sConfig += "-Pinned";
if (wc) {
sConfig += "-WriteCombined";
}
}
}
unsigned int i;
double dSeconds = 0.0;
for (i = 0; i < count; i++) {
dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1 << 20));
printf(
"bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
"bytes, NumDevsUsed = %d\n",
sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
}
}
///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
printf("Usage: bandwidthTest [OPTION]...\n");
printf(
"Test the bandwidth for device to host, host to device, and device to "
"device transfers\n");
printf("\n");
printf(
"Example: measure the bandwidth of device to host pinned memory copies "
"in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
printf(
"./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
"--increment=1024 --dtoh\n");
printf("\n");
printf("Options:\n");
printf("--help\tDisplay this help menu\n");
printf("--csv\tPrint results as a CSV\n");
printf("--device=[deviceno]\tSpecify the device device to be used\n");
printf(" all - compute cumulative bandwidth on all the devices\n");
printf(" 0,1,2,...,n - Specify any particular device to be used\n");
printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
printf(" pageable - pageable memory\n");
printf(" pinned - non-pageable system memory\n");
printf("--mode=[MODE]\tSpecify the mode to use\n");
printf(" quick - performs a quick measurement\n");
printf(" range - measures a user-specified range of values\n");
printf(" shmoo - performs an intense shmoo of a large range of values\n");
printf("--htod\tMeasure host to device transfers\n");
printf("--dtoh\tMeasure device to host transfers\n");
printf("--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
printf("--cputiming\tForce CPU-based timing always\n");
printf("Range mode options\n");
printf("--start=[SIZE]\tStarting transfer size in bytes\n");
printf("--end=[SIZE]\tEnding transfer size in bytes\n");
printf("--increment=[SIZE]\tIncrement size in bytes\n");
}

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2012
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>bandwidthTest_vs2012</RootNamespace>
<ProjectName>bandwidthTest</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bandwidthTest.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 13.00
# Visual Studio 2013
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>bandwidthTest_vs2013</RootNamespace>
<ProjectName>bandwidthTest</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bandwidthTest.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 14.00
# Visual Studio 2015
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>bandwidthTest_vs2015</RootNamespace>
<ProjectName>bandwidthTest</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bandwidthTest.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,108 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>bandwidthTest_vs2017</RootNamespace>
<ProjectName>bandwidthTest</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bandwidthTest.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -246,7 +246,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
@ -264,7 +268,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
LIBRARIES += -lcublas_static -lcusparse_static -lculibos
LIBRARIES += -lcublas_static -lcublasLt_static -lcusparse_static -lculibos
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"

View File

@ -31,6 +31,7 @@
</keywords>
<libraries>
<library>cublas_static</library>
<library>cublasLt_static</library>
<library>cusparse_static</library>
<library>culibos</library>
</libraries>
@ -55,6 +56,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ Linear Algebra, CUBLAS Library, CUSPARSE Library
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run

View File

@ -323,7 +323,7 @@ int main(int argc, char **argv) {
checkCudaErrors(cudaStreamCreate(&streamForGraph));
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
checkCudaErrors(cudaStreamBeginCapture(stream1));
checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -264,7 +264,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 60 61 70 72 75
else
SMS ?= 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -42,6 +42,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group
## Supported SM Architectures
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -27,7 +27,7 @@ x86_64, ppc64le
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -264,7 +264,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 60 61 70 72 75
else
SMS ?= 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -49,6 +49,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou
## Supported SM Architectures
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run

View File

@ -415,7 +415,7 @@ void getIdenticalGPUs(int num_of_gpus, std::set<int> &identicalGPUs) {
identicalGPUs.erase(it);
}
if (!deviceProp.cooperativeMultiDeviceLaunch ||
!deviceProp.concurrentManagedAccess) {
!deviceProp.managedMemory) {
identicalGPUs.erase(it);
}
it++;
@ -450,8 +450,7 @@ int main(int argc, char **argv) {
if (identicalGPUs.size() <= 1) {
printf(
"No Two or more GPUs with same architecture capable of "
"cooperativeMultiDeviceLaunch & concurrentManagedAccess found. "
"\nWaiving the sample\n");
"cooperativeMultiDeviceLaunch & managedMemory found. \nWaiving the sample\n");
exit(EXIT_WAIVED);
}

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -104,6 +104,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -246,12 +246,6 @@ ifeq ($(TARGET_ARCH),armv7l)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on aarch64
ifeq ($(TARGET_ARCH),aarch64)
$(info >>> WARNING - cudaTensorCoreGemm is not supported on aarch64 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
@ -264,7 +258,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 70 72 75
else
SMS ?= 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -43,12 +43,16 @@ In addition to that, it demonstrates the use of the new CUDA function attribute
<scope>1:CUDA Basic Topics</scope>
</scopes>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>aarch64</arch>
</env>
<env>
<platform>windows7</platform>
</env>

View File

@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores
## Supported SM Architectures
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -22,7 +22,7 @@ Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le
x86_64, ppc64le, aarch64
## CUDA APIs involved
@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
@ -52,9 +52,9 @@ $ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le.
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```

View File

@ -180,16 +180,16 @@
using namespace nvcuda;
__host__ void init_host_matrices(float *a, float *b, float *c) {
__host__ void init_host_matrices(half *a, half *b, float *c) {
for (int i = 0; i < M_GLOBAL; i++) {
for (int j = 0; j < K_GLOBAL; j++) {
a[i * K_GLOBAL + j] = static_cast<float>(rand() % 3);
a[i * K_GLOBAL + j] = (half)(rand() % 3);
}
}
for (int i = 0; i < N_GLOBAL; i++) {
for (int j = 0; j < K_GLOBAL; j++) {
b[i * K_GLOBAL + j] = static_cast<float>(rand() % 3);
b[i * K_GLOBAL + j] = (half)(rand() % 3);
}
}
@ -198,26 +198,6 @@ __host__ void init_host_matrices(float *a, float *b, float *c) {
}
}
__global__ void init_device_matrices(const float *A_h, const float *B_h,
const float *C_h, half *A, half *B,
float *C, float *D) {
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * K_GLOBAL;
i += gridDim.x * blockDim.x)
A[i] = __float2half(A_h[i]);
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N_GLOBAL * K_GLOBAL;
i += gridDim.x * blockDim.x)
B[i] = __float2half(B_h[i]);
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL;
i += gridDim.x * blockDim.x)
C[i] = C_h[i];
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL;
i += gridDim.x * blockDim.x)
D[i] = 0;
}
__global__ void compute_gemm(const half *A, const half *B, const float *C,
float *D, float alpha, float beta) {
extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
@ -486,7 +466,7 @@ __global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld,
}
}
__host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
__host__ void matMultiplyOnHost(half *A, half *B, float *C, float alpha,
float beta, int numARows, int numAColumns,
int numBRows, int numBColumns, int numCRows,
int numCColumns) {
@ -495,7 +475,7 @@ __host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
float temp = 0.0;
for (int k = 0; k < numAColumns; k++) {
temp += A[i * numAColumns + k] * B[j * numBRows + k];
temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k];
}
C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
@ -514,7 +494,7 @@ int main(int argc, char **argv) {
// Tensor cores require a GPU of Volta (SM7X) architecture or higher.
if (deviceProp.major < 7) {
printf(
"cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor "
"cudaTensorCoreGemm requires SM 7.0 or higher to use Tensor "
"Cores. Exiting...\n");
exit(EXIT_WAIVED);
}
@ -523,25 +503,20 @@ int main(int argc, char **argv) {
printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
float *A_h = NULL;
float *B_h = NULL;
half *A_h = NULL;
half *B_h = NULL;
float *C_h = NULL;
#if CPU_DEBUG
float *result_hD = NULL;
float *result_host = NULL;
#endif
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&A_h),
sizeof(float) * M_GLOBAL * K_GLOBAL));
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&B_h),
sizeof(float) * K_GLOBAL * N_GLOBAL));
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&C_h),
sizeof(float) * M_GLOBAL * N_GLOBAL));
A_h = (half *)malloc(sizeof(half) * M_GLOBAL * K_GLOBAL);
B_h = (half *)malloc(sizeof(half) * K_GLOBAL * N_GLOBAL);
C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
#if CPU_DEBUG
checkCudaErrors(cudaMallocManaged((void **)&result_hD,
sizeof(float) * M_GLOBAL * N_GLOBAL));
checkCudaErrors(cudaMallocManaged((void **)&result_host,
sizeof(float) * M_GLOBAL * N_GLOBAL));
result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
#endif
half *A = NULL;
@ -567,11 +542,13 @@ int main(int argc, char **argv) {
printf("Preparing data for GPU...\n");
checkKernelErrors(
(init_device_matrices<<<deviceProp.multiProcessorCount,
THREADS_PER_BLOCK>>>(A_h, B_h, C_h, A, B, C, D)));
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL));
enum {
// Compute the right amount of shared memory to request.
@ -650,6 +627,8 @@ int main(int argc, char **argv) {
printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i],
result_host[i]);
}
free(result_hD);
free(result_host);
#endif
float milliseconds = 0;
@ -662,9 +641,9 @@ int main(int argc, char **argv) {
(milliseconds / 1000.)) /
1e12);
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A_h)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B_h)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C_h)));
free(A_h);
free(B_h);
free(C_h);
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -246,7 +246,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -39,6 +39,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ CUDA Runtime API, Device Query
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,318 @@
################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
endif
ifeq ($(TARGET_OS),qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM
LDFLAGS += -lsocket
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on Mac OSX
ifeq ($(TARGET_OS),darwin)
$(info >>> WARNING - immaTensorCoreGemm is not supported on Mac OSX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - immaTensorCoreGemm is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 72 75
else
SMS ?= 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += -maxrregcount=255
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: immaTensorCoreGemm
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
immaTensorCoreGemm.o:immaTensorCoreGemm.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
immaTensorCoreGemm: immaTensorCoreGemm.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./immaTensorCoreGemm
clean:
rm -f immaTensorCoreGemm immaTensorCoreGemm.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/immaTensorCoreGemm
clobber: clean

View File

@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>immaTensorCoreGemm</name>
<cflags>
<flag>-maxrregcount=255</flag>
</cflags>
<cuda_api_list>
<toolkit>cudaMallocManaged</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaFuncSetAttribute</toolkit>
<toolkit>cudaEventCreate</toolkit>
<toolkit>cudaEventRecord</toolkit>
<toolkit>cudaEventSynchronize</toolkit>
<toolkit>cudaEventElapsedTime</toolkit>
<toolkit>cudaFree</toolkit>
</cuda_api_list>
<description><![CDATA[CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../common/inc</path>
</includepaths>
<keyconcepts>
<concept level="basic">Matrix Multiply</concept>
<concept level="advanced">WMMA</concept>
<concept level="advanced">Tensor Cores</concept>
</keyconcepts>
<keywords>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>immaTensorCoreGemm.cu</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
</scopes>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>aarch64</arch>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<from>7.2</from>
</supported_sm_architectures>
<title>Tensor Core GEMM Integer MMA</title>
<type>exe</type>
</entry>

View File

@ -0,0 +1,70 @@
# immaTensorCoreGemm - Tensor Core GEMM Integer MMA
## Description
CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default.
## Key Concepts
Matrix Multiply, WMMA, Tensor Cores
## Supported SM Architectures
[SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, aarch64
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree
## Prerequisites
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@ -0,0 +1,655 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// CUDA sample demonstrating a integer GEMM computation using the Warp Matrix
// Multiply and Accumulate API.
// In this program, the compute_gemm kernel computes the result of a matrix
// multiplication and addition: D = alpha * A * B + beta * C. The dimensions of
// both C and D matrices are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x
// K_GLOBAL (row-major), the B matrix is K_GLOBAL x N_GLOBAL (column-major). In
// that kernel, each CTA computes one 128 x 128 tile of the resulting matrix per
// iteration. When the tile is computed, the CTA stores it to the global memory
// and begins a new iteration, selecting a new 128 x 128 tile to compute.
// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes
// eight 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. Warps
// compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by
// moving through the K_GLOBAL dimension of the A and B matrices and
// accumulating the intermediate result in the local thread state.
// There are a number of simple optimizations used in the algorithm:
// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to
// shared memory. After that is done, each warp loads the C matrix fragments
// from shared memory, thus avoiding a random global memory access.
// - On each internal iteration, the CTA copies a portion of the A and B
// matrices from
// global memory to shared memory. After that, all warps in the CTA reuse the
// A and B data from shared memory, thus reducing the number of data copies
// from global memory.
// - The portions of the A and B matrices are stored in shared memory with an
// additional
// padding (skew) to reduce the number of shared memory access bank conflicts.
// (See a detailed explanation near the SKEW_HALF macro definition.)
// - When the CTA finishes computing the tiles of the resulting matrix, each
// warp stores
// its subtiles to shared memory. The CTA then copies the shared memory
// contents to global memory, again avoiding redundant random global memory
// accesses.
// - Note that the CTA tile size is chosen to maximize the GPU register
// utilization,
// but carefully enough to avoid local memory use.
#include <assert.h>
#include <cuda.h>
#include <mma.h>
#include <stdio.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
// Externally configurable parameters.
#ifndef CPU_DEBUG
// Set this to 1 to verify the correctness of the GPU-computed matrix.
#define CPU_DEBUG 0
#endif
#ifndef SHARED_MEMORY_LIMIT_64K
// Set this to 0 to use more than 64 Kb of shared memory to cache data, to
// improve the performance of the computations on GPU.
// Note that you need a GPU that can have more than 64 Kb of shared memory
// per multiprocessor.
#define SHARED_MEMORY_LIMIT_64K 1
#endif
// GPU configuration.
#define WARP_SIZE 32
// MMA matrix tile dimensions.
#define M 16
#define N 16
#define K 16
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
// GEMM configuration.
#define M_TILES 256
#define N_TILES 256
#define K_TILES 256
#define M_GLOBAL (M * M_TILES)
#define N_GLOBAL (N * N_TILES)
#define K_GLOBAL (K * K_TILES)
#define C_LAYOUT wmma::mem_row_major
// Implementation constants.
#define WARPS_PER_BLOCK 8
#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)
#if SHARED_MEMORY_LIMIT_64K
// With only 64 Kb shared memory available, we can fit two 8-tile chunks of
// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each
// (i.e. two 8x8 arrays of tiles of 16x16 uint8_t-typed elements per CTA).
// But we cannot account the 8 Kb total skew overhead, without which the
// performance would be severely impacted. So we choose to reduce the chunk size
// in half, i.e. the amount of A and B matrix data we cache in shared memory.
// Accordingly, this doubles the number of outer iterations across the global K
// dimension, which only slightly impacts the performance.
#define CHUNK_K 8
#else
#define CHUNK_K 16
#endif
#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(uint8_t))
#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4))
#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES)
#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP)
#define BLOCK_ROW_WARPS 2
#define BLOCK_COL_WARPS 4
#define WARP_ROW_TILES 4
#define WARP_COL_TILES 2
#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS)
#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS)
#define GLOBAL_MEM_STRIDE N_GLOBAL
#define SHMEM_STRIDE (N * BLOCK_ROW_TILES)
#define SHMEM_OFFSET (N * WARP_ROW_TILES)
// The macro below is used to shift rows of the A matrix and columns of the B
// matrix in shared memory to minimize possible bank conflicts. Before
// performing the nvcuda::wmma::mma_sync operation, the warp must load the
// matrix data using the nvcuda::wmma::load_matrix_sync operation. Although the
// memory access pattern is not specified for that function, each lane in the
// warp can read one or multiple matrix elements from different matrix rows or
// columns. For shared memory, such access can result in bank conflicts if
// different rows / columns of the matrix map to the same bank. By shifting each
// row and column by a few bytes, we make sure that they map to different banks,
// thus reducing the number of possible bank conflicts. The number of 16
// one-byte "uint8_t" elements is chosen as the minimum possible shift because
// we must keep each row and column 128-bit aligned, as required by
// nvcuda::wmma::load_matrix_sync.
#define SKEW_UINT8 16
#define checkKernelErrors(expr) \
do { \
expr; \
\
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, \
cudaGetErrorString(__err)); \
abort(); \
} \
} while (0)
using namespace nvcuda;
__host__ void init_host_matrices(uint8_t *a, uint8_t *b, int *c) {
for (int i = 0; i < M_GLOBAL; i++) {
for (int j = 0; j < K_GLOBAL; j++) {
a[i * K_GLOBAL + j] = (uint8_t)(rand() % 3);
}
}
for (int i = 0; i < N_GLOBAL; i++) {
for (int j = 0; j < K_GLOBAL; j++) {
b[i * K_GLOBAL + j] = (uint8_t)(rand() % 3);
}
}
for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) {
c[t] = (rand() % 3);
}
}
__global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B,
const int *C, int *D, int alpha, int beta) {
extern __shared__ uint8_t shmem[][CHUNK_K * K + SKEW_UINT8];
// Warp and lane identification.
const unsigned int warpId = threadIdx.x / WARP_SIZE;
const unsigned int laneId = threadIdx.x % WARP_SIZE;
// Offset in shared memory from which the B matrix is stored.
const size_t shmem_idx_b_off = BLOCK_COL_TILES * M;
// This pointer is used to access the C and D matrix tiles this warp computes.
int *shmem_warp_tile_ptr = (int *)&shmem[0][0] +
(warpId / 2) * SHMEM_STRIDE * K * 2 +
(warpId % 2) * SHMEM_OFFSET;
// This pointer is used to stream the C and D matrices block-wide tile to and
// from shared memory.
int *shmem_warp_stream_ptr = (int *)&shmem[0][0] + warpId * SHMEM_STRIDE * K;
// Adjust the beta scaler, as it'll be multiplied by alpha at the end of
// each tile computation. Technically this is not generally correct (may
// result in a loss of precision). Zero still needs to be specially handled
// though.
beta /= alpha;
// Each CTA slides along the 128 x 128 tiles from the top left corner of the
// matrix to the right and down, and selects the next tile to compute. Once
// there's no such tile, all warps in this CTA exit.
for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
const unsigned int block_tile_i =
((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
// Stop when there are no more D matrix tiles to compute in this CTA.
if (block_tile_i >= M_TILES) {
break;
}
// This warp's pointer to the C matrix data to copy memory from to shared
// memory.
const size_t gmem_idx =
(block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N;
const int *src_gmem_warp_stream_ptr = &C[gmem_idx];
// Stream multiple C tiles to shared memory.
#pragma unroll
for (int i = 0; i < K; i++) {
typedef int4 copy_t;
*((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) =
*((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) +
laneId);
}
__syncthreads();
// These fragments will accumulate the result of A and B matrix fragment
// multiplications along the K_GLOBAL dimension.
wmma::fragment<wmma::accumulator, M, N, K, int> c[WARP_COL_TILES]
[WARP_ROW_TILES];
// Load the C matrix tiles into fragments from shared memory.
#pragma unroll
for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
for (int j = 0; j < WARP_ROW_TILES; j++) {
const int *tile_ptr =
shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;
wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT);
}
}
__syncthreads();
// Scale the C matrix.
#pragma unroll
for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
for (int j = 0; j < WARP_ROW_TILES; j++) {
#pragma unroll
for (int t = 0; t < c[i][j].num_elements; t++) {
c[i][j].x[t] *= beta;
}
}
}
// Select what warp copies what matrix to shared memory.
// Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix.
const uint8_t *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] +
M * K_GLOBAL * (warpId % 4) * 2)
: (&B[block_tile_j * N * K_GLOBAL] +
N * K_GLOBAL * (warpId % 4) * 2);
// Go through the global K dimension by a fixed step at a time.
#pragma unroll
for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) {
// Copy slices of the A and B matrices to shared memory.
// The first half of the warps in the CTA copy the A matrix, the rest copy
// the B matrix.
size_t shmem_idx =
warpId < (WARPS_PER_BLOCK / 2)
? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2)
: (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off);
// First half of the warp copies the first row / column of the matrix,
// the second half of the warp copies the next.
int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K +
(laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) +
(laneId % CHUNK_COPY_LINE_LANES);
// Shift the second half of the warp to the next row / column in the
// shared memory.
shmem_idx += laneId / CHUNK_COPY_LINE_LANES;
#pragma unroll
for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2;
i++) {
// Copy 16 bytes at once in each lane.
*((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) =
*lane_ptr;
// Advance the global memory pointer and the shared memory index.
lane_ptr = (int4 *)((uint8_t *)lane_ptr +
K_GLOBAL * CHUNK_COPY_LINES_PER_WARP);
shmem_idx += CHUNK_COPY_LINES_PER_WARP;
}
__syncthreads();
// Compute a grid of C matrix tiles in each warp.
#pragma unroll
for (int k_step = 0; k_step < CHUNK_K; k_step++) {
wmma::fragment<wmma::matrix_a, M, N, K, uint8_t, wmma::row_major>
a[WARP_COL_TILES];
wmma::fragment<wmma::matrix_b, M, N, K, uint8_t, wmma::col_major>
b[WARP_ROW_TILES];
#pragma unroll
for (int i = 0; i < WARP_COL_TILES; i++) {
size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M);
const uint8_t *tile_ptr = &shmem[shmem_idx_a][k_step * K];
wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_UINT8);
#pragma unroll
for (int j = 0; j < WARP_ROW_TILES; j++) {
if (i == 0) {
// Load the B matrix fragment once, because it is going to be
// reused against the other A matrix fragments.
size_t shmem_idx_b = shmem_idx_b_off +
(WARP_ROW_TILES * N) * (warpId % 2) +
(j * N);
const uint8_t *tile_ptr = &shmem[shmem_idx_b][k_step * K];
wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_UINT8);
}
wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]);
}
}
}
__syncthreads();
}
// Store the D fragments to shared memory.
#pragma unroll
for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
for (int j = 0; j < WARP_ROW_TILES; j++) {
#pragma unroll
// Uniform, point-wise transformations of ALL fragment elements by ALL
// threads in the warp are well-defined even though element indices
// within fragment storage are not defined.
for (int t = 0; t < c[i][j].num_elements; t++) c[i][j].x[t] *= alpha;
int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;
wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT);
}
}
__syncthreads();
// Now that shared memory contains all the D tiles, stream them to global
// memory.
int *dst_gmem_warp_stream_ptr = &D[gmem_idx];
#pragma unroll
for (int i = 0; i < K; i++) {
*((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
*((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
}
__syncthreads();
}
}
// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
// 1) Matrices are packed in memory.
// 2) M, N and K are multiples of 16.
// 3) Neither A nor B are transposed.
// Note: This is a less performant version of the compute_gemm_imma kernel. It
// is designed for
// demonstration purposes only to show the CUDA WMMA API use without
// relying on availability of the shared memory.
__global__ void simple_wmma_gemm_imma(const uint8_t *a, const uint8_t *b,
const int *c, int *d, int m_ld, int n_ld,
int k_ld, int alpha, int beta) {
// Leading dimensions. Packed with no transpositions.
int lda = m_ld;
int ldb = k_ld;
int ldc = n_ld;
// Tile using a 2D grid
int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
// Declare the fragments
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, uint8_t,
wmma::row_major>
a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, uint8_t,
wmma::col_major>
b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> acc_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c_frag;
wmma::fill_fragment(acc_frag, 0.0f);
// Loop over k
for (int i = 0; i < k_ld; i += WMMA_K) {
int aCol = i;
int aRow = warpM * WMMA_M;
int bCol = i;
int bRow = warpN * WMMA_N;
// Bounds checking
if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
// Load the inputs
wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb);
// Perform the matrix multiplication
wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
}
}
// Load in the current value of c, scale it by beta, and add this our result
// scaled by alpha
int cCol = warpN * WMMA_N;
int cRow = warpM * WMMA_M;
if (cRow < m_ld && cCol < n_ld) {
wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc,
wmma::mem_row_major);
for (int i = 0; i < c_frag.num_elements; i++) {
c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
}
// Store the output
wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc,
wmma::mem_row_major);
}
}
__host__ void matMultiplyOnHost(uint8_t *A, uint8_t *B, int *C, int alpha,
int beta, int numARows, int numAColumns,
int numBRows, int numBColumns, int numCRows,
int numCColumns) {
for (int i = 0; i < numCRows; i++) {
for (int j = 0; j < numCColumns; j++) {
int temp = 0;
for (int k = 0; k < numAColumns; k++) {
temp += A[i * numAColumns + k] * B[j * numBRows + k];
}
C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
}
}
}
int main(int argc, char **argv) {
printf("Initializing...\n");
int dev = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
// Tensor cores require a GPU of Volta (SM72) architecture or higher.
if (deviceProp.major < 7 || (deviceProp.major <= 7 && deviceProp.minor < 2)) {
printf(
"immaTensorCoreGemm requires SM 7.2 or higher to use Tensor Cores. "
"Exiting...\n");
exit(EXIT_WAIVED);
}
printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES);
printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
uint8_t *A_h = NULL;
uint8_t *B_h = NULL;
int *C_h = NULL;
#if CPU_DEBUG
int *result_hD = NULL;
int *result_host = NULL;
#endif
A_h = (uint8_t *)malloc(sizeof(uint8_t) * M_GLOBAL * K_GLOBAL);
B_h = (uint8_t *)malloc(sizeof(uint8_t) * K_GLOBAL * N_GLOBAL);
C_h = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
#if CPU_DEBUG
result_hD = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
result_host = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
#endif
uint8_t *A = NULL;
uint8_t *B = NULL;
int *C = NULL;
int *D = NULL;
checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&A), sizeof(uint8_t) * M_GLOBAL * K_GLOBAL));
checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&B), sizeof(uint8_t) * N_GLOBAL * K_GLOBAL));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&C), sizeof(int) * M_GLOBAL * N_GLOBAL));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&D), sizeof(int) * M_GLOBAL * N_GLOBAL));
assert(((unsigned long long)A) % 128 == 0);
assert(((unsigned long long)B) % 128 == 0);
assert(((unsigned long long)C) % 128 == 0);
assert(((unsigned long long)D) % 128 == 0);
init_host_matrices(A_h, B_h, C_h);
checkCudaErrors(cudaMemcpy(A, A_h, sizeof(uint8_t) * M_GLOBAL * K_GLOBAL,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(B, B_h, sizeof(uint8_t) * N_GLOBAL * K_GLOBAL,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(C, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(D, 0, sizeof(int) * M_GLOBAL * N_GLOBAL));
printf("Preparing data for GPU...\n");
assert(((unsigned long long)A) % 128 == 0);
assert(((unsigned long long)B) % 128 == 0);
assert(((unsigned long long)C) % 128 == 0);
assert(((unsigned long long)D) % 128 == 0);
enum {
// Compute the right amount of shared memory to request.
// We need shared memory to hold per-CTA C and D matrix tiles, and to cache
// per-CTA chunks
// of the A and B matrices. Therefore, the right amount to request is the
// maximum of those
// two numbers.
SHMEM_SZ = MAX(sizeof(uint8_t) * (BLOCK_COL_TILES * M) *
(CHUNK_K * K + SKEW_UINT8) * 2,
M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N *
(BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(int))
};
printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);
int alpha = 1;
int beta = 1;
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start));
// If enough shared memory available on the GPU use high performant kernel
if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
printf("Computing... using high performance kernel compute_gemm_imma \n");
checkCudaErrors(cudaFuncSetAttribute(
compute_gemm_imma, cudaFuncAttributeMaxDynamicSharedMemorySize,
SHMEM_SZ));
checkKernelErrors(
(compute_gemm_imma<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK,
SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
#if CPU_DEBUG
checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL,
cudaMemcpyDeviceToHost));
#endif
} else {
dim3 gridDim;
dim3 blockDim;
// blockDim.x must be a multiple of warpSize
// 128x4 means we have 16 warps and a block computes a 64x64 output tile
blockDim.x = 128;
blockDim.y = 4;
gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) /
(WMMA_M * blockDim.x / 32);
gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
printf("Computing... using simple_wmma_gemm_imma kernel\n");
simple_wmma_gemm_imma<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL,
K_GLOBAL, alpha, beta);
#if CPU_DEBUG
checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL,
cudaMemcpyDeviceToHost));
#endif
}
checkCudaErrors(cudaEventRecord(stop));
checkCudaErrors(cudaEventSynchronize(stop));
#if CPU_DEBUG
printf("Verifying correctness of the computations...\n");
memcpy(result_host, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL);
matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL,
K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL);
for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
if (abs(result_hD[i] - result_host[i]) > 0) {
printf("mismatch i=%d result_hD=%d result_host=%d\n", i, result_hD[i],
result_host[i]);
}
}
free(result_host);
free(result_hD);
#endif
float milliseconds = 0;
checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
printf("Time: %f ms\n", milliseconds);
printf("TOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12);
free(A_h);
free(B_h);
free(C_h);
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
checkCudaErrors(cudaFree(reinterpret_cast<void *>(D)));
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2012
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>immaTensorCoreGemm_vs2012</RootNamespace>
<ProjectName>immaTensorCoreGemm</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="immaTensorCoreGemm.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 13.00
# Visual Studio 2013
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>immaTensorCoreGemm_vs2013</RootNamespace>
<ProjectName>immaTensorCoreGemm</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="immaTensorCoreGemm.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 14.00
# Visual Studio 2015
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>immaTensorCoreGemm_vs2015</RootNamespace>
<ProjectName>immaTensorCoreGemm</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="immaTensorCoreGemm.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,108 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>immaTensorCoreGemm_vs2017</RootNamespace>
<ProjectName>immaTensorCoreGemm</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="immaTensorCoreGemm.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -246,7 +246,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -46,6 +46,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ CUDA Runtime API, Linear Algebra
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -102,6 +102,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -103,6 +103,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -10,7 +10,7 @@ CUDA Driver API, Matrix Multiply
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -33,7 +33,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -34,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -107,6 +107,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

301
Samples/nvJPEG/Makefile Normal file
View File

@ -0,0 +1,301 @@
################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
endif
ifeq ($(TARGET_OS),qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM
LDFLAGS += -lsocket
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on Mac OSX
ifeq ($(TARGET_OS),darwin)
$(info >>> WARNING - nvJPEG is not supported on Mac OSX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - nvJPEG is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on aarch64
ifeq ($(TARGET_ARCH),aarch64)
$(info >>> WARNING - nvJPEG is not supported on aarch64 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../Common
LIBRARIES :=
################################################################################
LIBRARIES += -lnvjpeg
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: nvJPEG
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
nvJPEG.o:nvJPEG.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
nvJPEG: nvJPEG.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./nvJPEG
clean:
rm -f nvJPEG nvJPEG.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/nvJPEG
clobber: clean

View File

@ -0,0 +1,58 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>nvJPEG</name>
<description><![CDATA[A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../common/inc</path>
</includepaths>
<keyconcepts>
<concept level="basic">Image Decoding</concept>
<concept level="basic">NVJPEG Library</concept>
</keyconcepts>
<keywords>
<keyword>NVJPEG</keyword>
<keyword>JPEG Decoding</keyword>
</keywords>
<libraries>
<library>nvjpeg</library>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>nvJPEG.cpp</primary_file>
<qatests>
<qatest>-i ../../../../Samples/nvJPEG/images/</qatest>
</qatests>
<required_dependencies>
<dependency>NVJPEG</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>3:JPEG Decoding</scope>
</scopes>
<sm-arch>sm30</sm-arch>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<from>3.0</from>
</supported_sm_architectures>
<title>NVJPEG simple</title>
<type>exe</type>
</entry>

61
Samples/nvJPEG/README.md Normal file
View File

@ -0,0 +1,61 @@
# nvJPEG - NVJPEG simple
## Description
A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library.
## Key Concepts
Image Decoding, NVJPEG Library
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux
## Supported CPU Architecture
x86_64
## CUDA APIs involved
## Dependencies needed to build/run
[NVJPEG](../../README.md#nvjpeg)
## Prerequisites
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## Build and Run
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 63 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 92 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

559
Samples/nvJPEG/nvJPEG.cpp Normal file
View File

@ -0,0 +1,559 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This sample needs at least CUDA 10.0. It demonstrates usages of the nvJPEG
// library nvJPEG supports single and multiple image(batched) decode. Multiple
// images can be decoded using the API for batch mode
#include <cuda_runtime_api.h>
#include "nvJPEG_helper.hxx"
int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); }
int dev_free(void *p) { return (int)cudaFree(p); }
typedef std::vector<std::string> FileNames;
typedef std::vector<std::vector<char> > FileData;
struct decode_params_t {
std::string input_dir;
int batch_size;
int total_images;
int dev;
int warmup;
nvjpegJpegState_t nvjpeg_state;
nvjpegHandle_t nvjpeg_handle;
cudaStream_t stream;
nvjpegOutputFormat_t fmt;
bool write_decoded;
std::string output_dir;
bool pipelined;
bool batched;
};
int read_next_batch(FileNames &image_names, int batch_size,
FileNames::iterator &cur_iter, FileData &raw_data,
std::vector<size_t> &raw_len, FileNames &current_names) {
int counter = 0;
while (counter < batch_size) {
if (cur_iter == image_names.end()) {
std::cerr << "Image list is too short to fill the batch, adding files "
"from the beginning of the image list"
<< std::endl;
cur_iter = image_names.begin();
}
if (image_names.size() == 0) {
std::cerr << "No valid images left in the input list, exit" << std::endl;
return EXIT_FAILURE;
}
// Read an image from disk.
std::ifstream input(cur_iter->c_str(),
std::ios::in | std::ios::binary | std::ios::ate);
if (!(input.is_open())) {
std::cerr << "Cannot open image: " << *cur_iter
<< ", removing it from image list" << std::endl;
image_names.erase(cur_iter);
continue;
}
// Get the size
std::streamsize file_size = input.tellg();
input.seekg(0, std::ios::beg);
// resize if buffer is too small
if (raw_data[counter].size() < file_size) {
raw_data[counter].resize(file_size);
}
if (!input.read(raw_data[counter].data(), file_size)) {
std::cerr << "Cannot read from file: " << *cur_iter
<< ", removing it from image list" << std::endl;
image_names.erase(cur_iter);
continue;
}
raw_len[counter] = file_size;
current_names[counter] = *cur_iter;
counter++;
cur_iter++;
}
return EXIT_SUCCESS;
}
// prepare buffers for RGBi output format
int prepare_buffers(FileData &file_data, std::vector<size_t> &file_len,
std::vector<int> &img_width, std::vector<int> &img_height,
std::vector<nvjpegImage_t> &ibuf,
std::vector<nvjpegImage_t> &isz, FileNames &current_names,
decode_params_t &params) {
int widths[NVJPEG_MAX_COMPONENT];
int heights[NVJPEG_MAX_COMPONENT];
int channels;
nvjpegChromaSubsampling_t subsampling;
for (int i = 0; i < file_data.size(); i++) {
checkCudaErrors(nvjpegGetImageInfo(
params.nvjpeg_handle, (unsigned char *)file_data[i].data(), file_len[i],
&channels, &subsampling, widths, heights));
img_width[i] = widths[0];
img_height[i] = heights[0];
std::cout << "Processing: " << current_names[i] << std::endl;
std::cout << "Image is " << channels << " channels." << std::endl;
for (int c = 0; c < channels; c++) {
std::cout << "Channel #" << c << " size: " << widths[c] << " x "
<< heights[c] << std::endl;
}
switch (subsampling) {
case NVJPEG_CSS_444:
std::cout << "YUV 4:4:4 chroma subsampling" << std::endl;
break;
case NVJPEG_CSS_440:
std::cout << "YUV 4:4:0 chroma subsampling" << std::endl;
break;
case NVJPEG_CSS_422:
std::cout << "YUV 4:2:2 chroma subsampling" << std::endl;
break;
case NVJPEG_CSS_420:
std::cout << "YUV 4:2:0 chroma subsampling" << std::endl;
break;
case NVJPEG_CSS_411:
std::cout << "YUV 4:1:1 chroma subsampling" << std::endl;
break;
case NVJPEG_CSS_410:
std::cout << "YUV 4:1:0 chroma subsampling" << std::endl;
break;
case NVJPEG_CSS_GRAY:
std::cout << "Grayscale JPEG " << std::endl;
break;
case NVJPEG_CSS_UNKNOWN:
std::cout << "Unknown chroma subsampling" << std::endl;
return EXIT_FAILURE;
}
int mul = 1;
// in the case of interleaved RGB output, write only to single channel, but
// 3 samples at once
if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) {
channels = 1;
mul = 3;
}
// in the case of rgb create 3 buffers with sizes of original image
else if (params.fmt == NVJPEG_OUTPUT_RGB ||
params.fmt == NVJPEG_OUTPUT_BGR) {
channels = 3;
widths[1] = widths[2] = widths[0];
heights[1] = heights[2] = heights[0];
}
// realloc output buffer if required
for (int c = 0; c < channels; c++) {
int aw = mul * widths[c];
int ah = heights[c];
int sz = aw * ah;
ibuf[i].pitch[c] = aw;
if (sz > isz[i].pitch[c]) {
if (ibuf[i].channel[c]) {
checkCudaErrors(cudaFree(ibuf[i].channel[c]));
}
checkCudaErrors(cudaMalloc(&ibuf[i].channel[c], sz));
isz[i].pitch[c] = sz;
}
}
}
return EXIT_SUCCESS;
}
void release_buffers(std::vector<nvjpegImage_t> &ibuf) {
for (int i = 0; i < ibuf.size(); i++) {
for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++)
if (ibuf[i].channel[c]) checkCudaErrors(cudaFree(ibuf[i].channel[c]));
}
}
int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
std::vector<nvjpegImage_t> &out, decode_params_t &params,
double &time) {
checkCudaErrors(cudaStreamSynchronize(params.stream));
nvjpegStatus_t err;
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
if (!params.batched) {
if (!params.pipelined) // decode one image at a time
{
int thread_idx = 0;
sdkStartTimer(&timer);
for (int i = 0; i < params.batch_size; i++) {
checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, params.nvjpeg_state,
(const unsigned char *)img_data[i].data(),
img_len[i], params.fmt, &out[i],
params.stream));
checkCudaErrors(cudaStreamSynchronize(params.stream));
}
} else {
int thread_idx = 0;
sdkStartTimer(&timer);
for (int i = 0; i < params.batch_size; i++) {
checkCudaErrors(
nvjpegDecodePhaseOne(params.nvjpeg_handle, params.nvjpeg_state,
(const unsigned char *)img_data[i].data(),
img_len[i], params.fmt, params.stream));
checkCudaErrors(cudaStreamSynchronize(params.stream));
checkCudaErrors(nvjpegDecodePhaseTwo(
params.nvjpeg_handle, params.nvjpeg_state, params.stream));
checkCudaErrors(nvjpegDecodePhaseThree(
params.nvjpeg_handle, params.nvjpeg_state, &out[i], params.stream));
}
checkCudaErrors(cudaStreamSynchronize(params.stream));
}
} else {
std::vector<const unsigned char *> raw_inputs;
for (int i = 0; i < params.batch_size; i++) {
raw_inputs.push_back((const unsigned char *)img_data[i].data());
}
if (!params.pipelined) // decode multiple images in a single batch
{
sdkStartTimer(&timer);
checkCudaErrors(nvjpegDecodeBatched(
params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(),
img_len.data(), out.data(), params.stream));
checkCudaErrors(cudaStreamSynchronize(params.stream));
} else {
int thread_idx = 0;
for (int i = 0; i < params.batch_size; i++) {
checkCudaErrors(nvjpegDecodeBatchedPhaseOne(
params.nvjpeg_handle, params.nvjpeg_state, raw_inputs[i],
img_len[i], i, thread_idx, params.stream));
}
checkCudaErrors(nvjpegDecodeBatchedPhaseTwo(
params.nvjpeg_handle, params.nvjpeg_state, params.stream));
checkCudaErrors(nvjpegDecodeBatchedPhaseThree(params.nvjpeg_handle,
params.nvjpeg_state,
out.data(), params.stream));
checkCudaErrors(cudaStreamSynchronize(params.stream));
}
}
sdkStopTimer(&timer);
time = sdkGetAverageTimerValue(&timer)/1000.0f;
return EXIT_SUCCESS;
}
int write_images(std::vector<nvjpegImage_t> &iout, std::vector<int> &widths,
std::vector<int> &heights, decode_params_t &params,
FileNames &filenames) {
for (int i = 0; i < params.batch_size; i++) {
// Get the file name, without extension.
// This will be used to rename the output file.
size_t position = filenames[i].rfind("/");
std::string sFileName =
(std::string::npos == position)
? filenames[i]
: filenames[i].substr(position + 1, filenames[i].size());
position = sFileName.rfind(".");
sFileName = (std::string::npos == position) ? sFileName
: sFileName.substr(0, position);
std::string fname(params.output_dir + "/" + sFileName + ".bmp");
int err;
if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) {
err = writeBMP(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
iout[i].channel[1], iout[i].pitch[1], iout[i].channel[2],
iout[i].pitch[2], widths[i], heights[i]);
} else if (params.fmt == NVJPEG_OUTPUT_RGBI ||
params.fmt == NVJPEG_OUTPUT_BGRI) {
// Write BMP from interleaved data
err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
widths[i], heights[i]);
}
if (err) {
std::cout << "Cannot write output file: " << fname << std::endl;
return EXIT_FAILURE;
}
std::cout << "Done writing decoded image to file: " << fname << std::endl;
}
}
double process_images(FileNames &image_names, decode_params_t &params,
double &total) {
// vector for storing raw files and file lengths
FileData file_data(params.batch_size);
std::vector<size_t> file_len(params.batch_size);
FileNames current_names(params.batch_size);
std::vector<int> widths(params.batch_size);
std::vector<int> heights(params.batch_size);
// we wrap over image files to process total_images of files
FileNames::iterator file_iter = image_names.begin();
// stream for decoding
checkCudaErrors(
cudaStreamCreateWithFlags(&params.stream, cudaStreamNonBlocking));
int total_processed = 0;
// output buffers
std::vector<nvjpegImage_t> iout(params.batch_size);
// output buffer sizes, for convenience
std::vector<nvjpegImage_t> isz(params.batch_size);
for (int i = 0; i < iout.size(); i++) {
for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) {
iout[i].channel[c] = NULL;
iout[i].pitch[c] = 0;
isz[i].pitch[c] = 0;
}
}
double test_time = 0;
int warmup = 0;
while (total_processed < params.total_images) {
if (read_next_batch(image_names, params.batch_size, file_iter, file_data,
file_len, current_names))
return EXIT_FAILURE;
if (prepare_buffers(file_data, file_len, widths, heights, iout, isz,
current_names, params))
return EXIT_FAILURE;
double time;
if (decode_images(file_data, file_len, iout, params, time))
return EXIT_FAILURE;
if (warmup < params.warmup) {
warmup++;
} else {
total_processed += params.batch_size;
test_time += time;
}
if (params.write_decoded)
write_images(iout, widths, heights, params, current_names);
}
total = test_time;
release_buffers(iout);
checkCudaErrors(cudaStreamDestroy(params.stream));
return EXIT_SUCCESS;
}
// parse parameters
int findParamIndex(const char **argv, int argc, const char *parm) {
int count = 0;
int index = -1;
for (int i = 0; i < argc; i++) {
if (strncmp(argv[i], parm, 100) == 0) {
index = i;
count++;
}
}
if (count == 0 || count == 1) {
return index;
} else {
std::cout << "Error, parameter " << parm
<< " has been specified more than once, exiting\n"
<< std::endl;
return -1;
}
return -1;
}
int main(int argc, const char *argv[]) {
int pidx;
if ((pidx = findParamIndex(argv, argc, "-h")) != -1 ||
(pidx = findParamIndex(argv, argc, "--help")) != -1) {
std::cout << "Usage: " << argv[0]
<< " -i images_dir [-b batch_size] [-t total_images] [-device= "
"device_id] [-w warmup_iterations] [-o output_dir] "
"[-pipelined] [-batched] [-fmt output_format]\n";
std::cout << "Parameters: " << std::endl;
std::cout << "\timages_dir\t:\tPath to single image or directory of images"
<< std::endl;
std::cout << "\tbatch_size\t:\tDecode images from input by batches of "
"specified size"
<< std::endl;
std::cout << "\ttotal_images\t:\tDecode this much images, if there are "
"less images \n"
<< "\t\t\t\t\tin the input than total images, decoder will loop "
"over the input"
<< std::endl;
std::cout << "\tdevice_id\t:\tWhich device to use for decoding"
<< std::endl;
std::cout << "\twarmup_iterations\t:\tRun this amount of batches first "
"without measuring performance"
<< std::endl;
std::cout
<< "\toutput_dir\t:\tWrite decoded images as BMPs to this directory"
<< std::endl;
std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl;
std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl;
std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. One "
"of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]"
<< std::endl;
return EXIT_SUCCESS;
}
decode_params_t params;
params.input_dir = "./";
if ((pidx = findParamIndex(argv, argc, "-i")) != -1) {
params.input_dir = argv[pidx + 1];
} else {
std::cerr << "Please specify input directory with encoded images"
<< std::endl;
return EXIT_WAIVED;
}
params.batch_size = 1;
if ((pidx = findParamIndex(argv, argc, "-b")) != -1) {
params.batch_size = std::atoi(argv[pidx + 1]);
}
params.total_images = -1;
if ((pidx = findParamIndex(argv, argc, "-t")) != -1) {
params.total_images = std::atoi(argv[pidx + 1]);
}
params.dev = 0;
params.dev = findCudaDevice(argc, argv);
params.warmup = 0;
if ((pidx = findParamIndex(argv, argc, "-w")) != -1) {
params.warmup = std::atoi(argv[pidx + 1]);
}
params.batched = false;
if ((pidx = findParamIndex(argv, argc, "-batched")) != -1) {
params.batched = true;
}
params.pipelined = false;
if ((pidx = findParamIndex(argv, argc, "-pipelined")) != -1) {
params.pipelined = true;
}
params.fmt = NVJPEG_OUTPUT_RGB;
if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) {
std::string sfmt = argv[pidx + 1];
if (sfmt == "rgb")
params.fmt = NVJPEG_OUTPUT_RGB;
else if (sfmt == "bgr")
params.fmt = NVJPEG_OUTPUT_BGR;
else if (sfmt == "rgbi")
params.fmt = NVJPEG_OUTPUT_RGBI;
else if (sfmt == "bgri")
params.fmt = NVJPEG_OUTPUT_BGRI;
else if (sfmt == "yuv")
params.fmt = NVJPEG_OUTPUT_YUV;
else if (sfmt == "y")
params.fmt = NVJPEG_OUTPUT_Y;
else if (sfmt == "unchanged")
params.fmt = NVJPEG_OUTPUT_UNCHANGED;
else {
std::cout << "Unknown format: " << sfmt << std::endl;
return EXIT_FAILURE;
}
}
params.write_decoded = false;
if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
params.output_dir = argv[pidx + 1];
if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR &&
params.fmt != NVJPEG_OUTPUT_RGBI && params.fmt != NVJPEG_OUTPUT_BGRI) {
std::cout << "We can write ony BMPs, which require output format be "
"either RGB/BGR or RGBi/BGRi"
<< std::endl;
return EXIT_FAILURE;
}
params.write_decoded = true;
}
cudaDeviceProp props;
checkCudaErrors(cudaGetDeviceProperties(&props, params.dev));
printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n",
params.dev, props.name, props.multiProcessorCount,
props.maxThreadsPerMultiProcessor, props.major, props.minor,
props.ECCEnabled ? "on" : "off");
nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free};
checkCudaErrors(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator,
&params.nvjpeg_handle));
checkCudaErrors(
nvjpegJpegStateCreate(params.nvjpeg_handle, &params.nvjpeg_state));
checkCudaErrors(
nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
params.batch_size, 1, params.fmt));
// read source images
FileNames image_names;
readInput(params.input_dir, image_names);
if (params.total_images == -1) {
params.total_images = image_names.size();
} else if (params.total_images % params.batch_size) {
params.total_images =
((params.total_images) / params.batch_size) * params.batch_size;
std::cout << "Changing total_images number to " << params.total_images
<< " to be multiple of batch_size - " << params.batch_size
<< std::endl;
}
std::cout << "Decoding images in directory: " << params.input_dir
<< ", total " << params.total_images << ", batchsize "
<< params.batch_size << std::endl;
double total;
if (process_images(image_names, params, total)) return EXIT_FAILURE;
std::cout << "Total decoding time: " << total << std::endl;
std::cout << "Avg decoding time per image: " << total / params.total_images
<< std::endl;
std::cout << "Avg images per sec: " << params.total_images / total
<< std::endl;
std::cout << "Avg decoding time per batch: "
<< total / ((params.total_images + params.batch_size - 1) /
params.batch_size)
<< std::endl;
checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state));
checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle));
return EXIT_SUCCESS;
}

View File

@ -0,0 +1,338 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This sample needs at least CUDA 10.0.
// It demonstrates usages of the nvJPEG library
#ifndef NV_JPEG_EXAMPLE
#define NV_JPEG_EXAMPLE
#include "cuda_runtime.h"
#include "nvjpeg.h"
#include "helper_cuda.h"
#include "helper_timer.h"
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <string.h> // strcmpi
#include <sys/time.h> // timings
#include <dirent.h> // linux dir traverse
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
// write bmp, input - RGB, device
int writeBMP(const char *filename, const unsigned char *d_chanR, int pitchR,
const unsigned char *d_chanG, int pitchG,
const unsigned char *d_chanB, int pitchB, int width, int height) {
unsigned int headers[13];
FILE *outfile;
int extrabytes;
int paddedsize;
int x;
int y;
int n;
int red, green, blue;
std::vector<unsigned char> vchanR(height * width);
std::vector<unsigned char> vchanG(height * width);
std::vector<unsigned char> vchanB(height * width);
unsigned char *chanR = vchanR.data();
unsigned char *chanG = vchanG.data();
unsigned char *chanB = vchanB.data();
checkCudaErrors(cudaMemcpy2D(chanR, (size_t)width, d_chanR, (size_t)pitchR,
width, height, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy2D(chanG, (size_t)width, d_chanG, (size_t)pitchR,
width, height, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy2D(chanB, (size_t)width, d_chanB, (size_t)pitchR,
width, height, cudaMemcpyDeviceToHost));
extrabytes =
4 - ((width * 3) % 4); // How many bytes of padding to add to each
// horizontal line - the size of which must
// be a multiple of 4 bytes.
if (extrabytes == 4) extrabytes = 0;
paddedsize = ((width * 3) + extrabytes) * height;
// Headers...
// Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
// "headers".
headers[0] = paddedsize + 54; // bfSize (whole file size)
headers[1] = 0; // bfReserved (both)
headers[2] = 54; // bfOffbits
headers[3] = 40; // biSize
headers[4] = width; // biWidth
headers[5] = height; // biHeight
// Would have biPlanes and biBitCount in position 6, but they're shorts.
// It's easier to write them out separately (see below) than pretend
// they're a single int, especially with endian issues...
headers[7] = 0; // biCompression
headers[8] = paddedsize; // biSizeImage
headers[9] = 0; // biXPelsPerMeter
headers[10] = 0; // biYPelsPerMeter
headers[11] = 0; // biClrUsed
headers[12] = 0; // biClrImportant
if (!(outfile = fopen(filename, "wb"))) {
std::cerr << "Cannot open file: " << filename << std::endl;
return 1;
}
//
// Headers begin...
// When printing ints and shorts, we write out 1 character at a time to avoid
// endian issues.
//
fprintf(outfile, "BM");
for (n = 0; n <= 5; n++) {
fprintf(outfile, "%c", headers[n] & 0x000000FF);
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
}
// These next 4 characters are for the biPlanes and biBitCount fields.
fprintf(outfile, "%c", 1);
fprintf(outfile, "%c", 0);
fprintf(outfile, "%c", 24);
fprintf(outfile, "%c", 0);
for (n = 7; n <= 12; n++) {
fprintf(outfile, "%c", headers[n] & 0x000000FF);
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
}
//
// Headers done, now write the data...
//
for (y = height - 1; y >= 0;
y--) // BMP image format is written from bottom to top...
{
for (x = 0; x <= width - 1; x++) {
red = chanR[y * width + x];
green = chanG[y * width + x];
blue = chanB[y * width + x];
if (red > 255) red = 255;
if (red < 0) red = 0;
if (green > 255) green = 255;
if (green < 0) green = 0;
if (blue > 255) blue = 255;
if (blue < 0) blue = 0;
// Also, it's written in (b,g,r) format...
fprintf(outfile, "%c", blue);
fprintf(outfile, "%c", green);
fprintf(outfile, "%c", red);
}
if (extrabytes) // See above - BMP lines must be of lengths divisible by 4.
{
for (n = 1; n <= extrabytes; n++) {
fprintf(outfile, "%c", 0);
}
}
}
fclose(outfile);
return 0;
}
// write bmp, input - RGB, device
int writeBMPi(const char *filename, const unsigned char *d_RGB, int pitch,
int width, int height) {
unsigned int headers[13];
FILE *outfile;
int extrabytes;
int paddedsize;
int x;
int y;
int n;
int red, green, blue;
std::vector<unsigned char> vchanRGB(height * width * 3);
unsigned char *chanRGB = vchanRGB.data();
checkCudaErrors(cudaMemcpy2D(chanRGB, (size_t)width * 3, d_RGB, (size_t)pitch,
width * 3, height, cudaMemcpyDeviceToHost));
extrabytes =
4 - ((width * 3) % 4); // How many bytes of padding to add to each
// horizontal line - the size of which must
// be a multiple of 4 bytes.
if (extrabytes == 4) extrabytes = 0;
paddedsize = ((width * 3) + extrabytes) * height;
// Headers...
// Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
// "headers".
headers[0] = paddedsize + 54; // bfSize (whole file size)
headers[1] = 0; // bfReserved (both)
headers[2] = 54; // bfOffbits
headers[3] = 40; // biSize
headers[4] = width; // biWidth
headers[5] = height; // biHeight
// Would have biPlanes and biBitCount in position 6, but they're shorts.
// It's easier to write them out separately (see below) than pretend
// they're a single int, especially with endian issues...
headers[7] = 0; // biCompression
headers[8] = paddedsize; // biSizeImage
headers[9] = 0; // biXPelsPerMeter
headers[10] = 0; // biYPelsPerMeter
headers[11] = 0; // biClrUsed
headers[12] = 0; // biClrImportant
if (!(outfile = fopen(filename, "wb"))) {
std::cerr << "Cannot open file: " << filename << std::endl;
return 1;
}
//
// Headers begin...
// When printing ints and shorts, we write out 1 character at a time to avoid
// endian issues.
//
fprintf(outfile, "BM");
for (n = 0; n <= 5; n++) {
fprintf(outfile, "%c", headers[n] & 0x000000FF);
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
}
// These next 4 characters are for the biPlanes and biBitCount fields.
fprintf(outfile, "%c", 1);
fprintf(outfile, "%c", 0);
fprintf(outfile, "%c", 24);
fprintf(outfile, "%c", 0);
for (n = 7; n <= 12; n++) {
fprintf(outfile, "%c", headers[n] & 0x000000FF);
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
}
//
// Headers done, now write the data...
//
for (y = height - 1; y >= 0;
y--) // BMP image format is written from bottom to top...
{
for (x = 0; x <= width - 1; x++) {
red = chanRGB[(y * width + x) * 3];
green = chanRGB[(y * width + x) * 3 + 1];
blue = chanRGB[(y * width + x) * 3 + 2];
if (red > 255) red = 255;
if (red < 0) red = 0;
if (green > 255) green = 255;
if (green < 0) green = 0;
if (blue > 255) blue = 255;
if (blue < 0) blue = 0;
// Also, it's written in (b,g,r) format...
fprintf(outfile, "%c", blue);
fprintf(outfile, "%c", green);
fprintf(outfile, "%c", red);
}
if (extrabytes) // See above - BMP lines must be of lengths divisible by 4.
{
for (n = 1; n <= extrabytes; n++) {
fprintf(outfile, "%c", 0);
}
}
}
fclose(outfile);
return 0;
}
int readInput(const std::string &sInputPath,
std::vector<std::string> &filelist) {
int error_code = 1;
struct stat s;
if (stat(sInputPath.c_str(), &s) == 0) {
if (s.st_mode & S_IFREG) {
filelist.push_back(sInputPath);
} else if (s.st_mode & S_IFDIR) {
// processing each file in directory
DIR *dir_handle;
struct dirent *dir;
dir_handle = opendir(sInputPath.c_str());
std::vector<std::string> filenames;
if (dir_handle) {
error_code = 0;
while ((dir = readdir(dir_handle)) != NULL) {
if (dir->d_type == DT_REG) {
std::string sFileName = sInputPath + dir->d_name;
filelist.push_back(sFileName);
} else if (dir->d_type == DT_DIR) {
std::string sname = dir->d_name;
if (sname != "." && sname != "..") {
readInput(sInputPath + sname + "/", filelist);
}
}
}
closedir(dir_handle);
} else {
std::cout << "Cannot open input directory: " << sInputPath << std::endl;
return error_code;
}
} else {
std::cout << "Cannot open input: " << sInputPath << std::endl;
return error_code;
}
} else {
std::cout << "Cannot find input path " << sInputPath << std::endl;
return error_code;
}
return 0;
}
#endif

View File

@ -246,7 +246,11 @@ LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)

View File

@ -48,6 +48,7 @@
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>

View File

@ -10,7 +10,7 @@ Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Spa
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess
## Prerequisites
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run

Some files were not shown because too many files have changed in this diff Show More