Add and update samples with CUDA 10.1 support
|
@ -282,6 +282,42 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef NVJPEGAPI
|
||||||
|
// nvJPEG API errors
|
||||||
|
static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
|
||||||
|
switch (error) {
|
||||||
|
case NVJPEG_STATUS_SUCCESS:
|
||||||
|
return "NVJPEG_STATUS_SUCCESS";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_NOT_INITIALIZED:
|
||||||
|
return "NVJPEG_STATUS_NOT_INITIALIZED";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_INVALID_PARAMETER:
|
||||||
|
return "NVJPEG_STATUS_INVALID_PARAMETER";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_BAD_JPEG:
|
||||||
|
return "NVJPEG_STATUS_BAD_JPEG";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
|
||||||
|
return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_ALLOCATOR_FAILURE:
|
||||||
|
return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_EXECUTION_FAILED:
|
||||||
|
return "NVJPEG_STATUS_EXECUTION_FAILED";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_ARCH_MISMATCH:
|
||||||
|
return "NVJPEG_STATUS_ARCH_MISMATCH";
|
||||||
|
|
||||||
|
case NVJPEG_STATUS_INTERNAL_ERROR:
|
||||||
|
return "NVJPEG_STATUS_INTERNAL_ERROR";
|
||||||
|
}
|
||||||
|
|
||||||
|
return "<unknown>";
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NV_NPPIDEFS_H
|
#ifdef NV_NPPIDEFS_H
|
||||||
// NPP API errors
|
// NPP API errors
|
||||||
static const char *_cudaGetErrorEnum(NppStatus error) {
|
static const char *_cudaGetErrorEnum(NppStatus error) {
|
||||||
|
|
178
Common/helper_multiprocess.cpp
Normal file
|
@ -0,0 +1,178 @@
|
||||||
|
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "helper_multiprocess.h"
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
info->size = sz;
|
||||||
|
info->shmHandle = CreateFileMapping(INVALID_HANDLE_VALUE,
|
||||||
|
NULL,
|
||||||
|
PAGE_READWRITE,
|
||||||
|
0,
|
||||||
|
(DWORD)sz,
|
||||||
|
name);
|
||||||
|
if (info->shmHandle == 0) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
int status = 0;
|
||||||
|
|
||||||
|
info->size = sz;
|
||||||
|
|
||||||
|
info->shmFd = shm_open(name, O_RDWR | O_CREAT, 0777);
|
||||||
|
if (info->shmFd < 0) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
status = ftruncate(info->shmFd, sz);
|
||||||
|
if (status != 0) {
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
info->size = sz;
|
||||||
|
|
||||||
|
info->shmHandle = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, name);
|
||||||
|
if (info->shmHandle == 0) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return GetLastError();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
info->size = sz;
|
||||||
|
|
||||||
|
info->shmFd = shm_open(name, O_RDWR, 0777);
|
||||||
|
if (info->shmFd < 0) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0);
|
||||||
|
if (info->addr == NULL) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void sharedMemoryClose(sharedMemoryInfo *info)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
if (info->addr) {
|
||||||
|
UnmapViewOfFile(info->addr);
|
||||||
|
}
|
||||||
|
if (info->shmHandle) {
|
||||||
|
CloseHandle(info->shmHandle);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (info->addr) {
|
||||||
|
munmap(info->addr, info->size);
|
||||||
|
}
|
||||||
|
if (info->shmFd) {
|
||||||
|
close(info->shmFd);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int spawnProcess(Process *process, const char *app, char * const *args)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
STARTUPINFO si = {0};
|
||||||
|
BOOL status;
|
||||||
|
size_t arglen = 0;
|
||||||
|
size_t argIdx = 0;
|
||||||
|
std::string arg_string;
|
||||||
|
memset(process, 0, sizeof(*process));
|
||||||
|
|
||||||
|
while (*args) {
|
||||||
|
arg_string.append(*args).append(1, ' ');
|
||||||
|
args++;
|
||||||
|
}
|
||||||
|
|
||||||
|
status = CreateProcess(app, LPSTR(arg_string.c_str()), NULL, NULL, FALSE, 0, NULL, NULL, &si, process);
|
||||||
|
|
||||||
|
return status ? 0 : GetLastError();
|
||||||
|
#else
|
||||||
|
*process = fork();
|
||||||
|
if (*process == 0) {
|
||||||
|
if (0 > execvp(app, args)) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (*process < 0) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int waitProcess(Process *process)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
DWORD exitCode;
|
||||||
|
WaitForSingleObject(process->hProcess, INFINITE);
|
||||||
|
GetExitCodeProcess(process->hProcess, &exitCode);
|
||||||
|
CloseHandle(process->hProcess);
|
||||||
|
CloseHandle(process->hThread);
|
||||||
|
return (int)exitCode;
|
||||||
|
#else
|
||||||
|
int status = 0;
|
||||||
|
do {
|
||||||
|
if (0 > waitpid(*process, &status, 0)) {
|
||||||
|
return errno;
|
||||||
|
}
|
||||||
|
} while (!WIFEXITED(status));
|
||||||
|
return WEXITSTATUS(status);
|
||||||
|
#endif
|
||||||
|
}
|
71
Common/helper_multiprocess.h
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HELPER_MULTIPROCESS_H
|
||||||
|
#define HELPER_MULTIPROCESS_H
|
||||||
|
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
#ifndef WIN32_LEAN_AND_MEAN
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct sharedMemoryInfo_st {
|
||||||
|
void *addr;
|
||||||
|
size_t size;
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
HANDLE shmHandle;
|
||||||
|
#else
|
||||||
|
int shmFd;
|
||||||
|
#endif
|
||||||
|
} sharedMemoryInfo;
|
||||||
|
|
||||||
|
int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info);
|
||||||
|
|
||||||
|
int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info);
|
||||||
|
|
||||||
|
void sharedMemoryClose(sharedMemoryInfo *info);
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
typedef PROCESS_INFORMATION Process;
|
||||||
|
#else
|
||||||
|
typedef pid_t Process;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int spawnProcess(Process *process, const char *app, char * const *args);
|
||||||
|
|
||||||
|
int waitProcess(Process *process);
|
||||||
|
|
||||||
|
#endif // HELPER_MULTIPROCESS_H
|
|
@ -234,6 +234,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
SAMPLE_ENABLED := 1
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on QNX
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
$(info >>> WARNING - UnifiedMemoryPerf is not supported on QNX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
ALL_LDFLAGS :=
|
ALL_LDFLAGS :=
|
||||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
@ -246,7 +252,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -52,6 +52,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -105,6 +105,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -105,6 +105,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -105,6 +105,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
304
Samples/bandwidthTest/Makefile
Normal file
|
@ -0,0 +1,304 @@
|
||||||
|
################################################################################
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
#
|
||||||
|
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Location of the CUDA Toolkit
|
||||||
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# start deprecated interface #
|
||||||
|
##############################
|
||||||
|
ifeq ($(x86_64),1)
|
||||||
|
$(info WARNING - x86_64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||||
|
TARGET_ARCH ?= x86_64
|
||||||
|
endif
|
||||||
|
ifeq ($(ARMv7),1)
|
||||||
|
$(info WARNING - ARMv7 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||||
|
TARGET_ARCH ?= armv7l
|
||||||
|
endif
|
||||||
|
ifeq ($(aarch64),1)
|
||||||
|
$(info WARNING - aarch64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||||
|
TARGET_ARCH ?= aarch64
|
||||||
|
endif
|
||||||
|
ifeq ($(ppc64le),1)
|
||||||
|
$(info WARNING - ppc64le variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||||
|
TARGET_ARCH ?= ppc64le
|
||||||
|
endif
|
||||||
|
ifneq ($(GCC),)
|
||||||
|
$(info WARNING - GCC variable has been deprecated)
|
||||||
|
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||||
|
HOST_COMPILER ?= $(GCC)
|
||||||
|
endif
|
||||||
|
ifneq ($(abi),)
|
||||||
|
$(error ERROR - abi variable has been removed)
|
||||||
|
endif
|
||||||
|
############################
|
||||||
|
# end deprecated interface #
|
||||||
|
############################
|
||||||
|
|
||||||
|
# architecture
|
||||||
|
HOST_ARCH := $(shell uname -m)
|
||||||
|
TARGET_ARCH ?= $(HOST_ARCH)
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
|
||||||
|
TARGET_SIZE := 64
|
||||||
|
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||||
|
TARGET_SIZE := 32
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||||
|
endif
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
|
||||||
|
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||||
|
TARGET_ARCH = armv7l
|
||||||
|
endif
|
||||||
|
|
||||||
|
# operating system
|
||||||
|
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||||
|
TARGET_OS ?= $(HOST_OS)
|
||||||
|
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||||
|
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# host compiler
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||||
|
HOST_COMPILER ?= clang++
|
||||||
|
endif
|
||||||
|
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||||
|
ifeq ($(TARGET_OS),linux)
|
||||||
|
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
ifeq ($(TARGET_OS), linux)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
|
||||||
|
else ifeq ($(TARGET_OS), android)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
HOST_COMPILER ?= g++
|
||||||
|
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||||
|
|
||||||
|
# internal flags
|
||||||
|
NVCCFLAGS := -m${TARGET_SIZE}
|
||||||
|
CCFLAGS :=
|
||||||
|
LDFLAGS :=
|
||||||
|
|
||||||
|
# build flags
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||||
|
CCFLAGS += -arch $(HOST_ARCH)
|
||||||
|
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||||
|
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||||
|
CCFLAGS += -mfloat-abi=hard
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
LDFLAGS += -pie
|
||||||
|
CCFLAGS += -fpie -fpic -fexceptions
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||||
|
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
CCFLAGS += -DWIN_INTERFACE_CUSTOM
|
||||||
|
LDFLAGS += -lsocket
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Install directory of different arch
|
||||||
|
CUDA_INSTALL_TARGET_DIR :=
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debug build flags
|
||||||
|
ifeq ($(dbg),1)
|
||||||
|
NVCCFLAGS += -g -G
|
||||||
|
BUILD_TYPE := debug
|
||||||
|
else
|
||||||
|
BUILD_TYPE := release
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS :=
|
||||||
|
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
ALL_LDFLAGS :=
|
||||||
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||||
|
|
||||||
|
# Common includes and paths for CUDA
|
||||||
|
INCLUDES := -I../../Common
|
||||||
|
LIBRARIES :=
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SMS),)
|
||||||
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GENCODE_FLAGS),)
|
||||||
|
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||||
|
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||||
|
|
||||||
|
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||||
|
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||||
|
ifneq ($(HIGHEST_SM),)
|
||||||
|
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
EXEC ?= @echo "[@]"
|
||||||
|
endif
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Target rules
|
||||||
|
all: build
|
||||||
|
|
||||||
|
build: bandwidthTest
|
||||||
|
|
||||||
|
check.deps:
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
@echo "Sample will be waived due to the above missing dependencies"
|
||||||
|
else
|
||||||
|
@echo "Sample is ready - all dependencies have been met"
|
||||||
|
endif
|
||||||
|
|
||||||
|
bandwidthTest.o:bandwidthTest.cu
|
||||||
|
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
|
bandwidthTest: bandwidthTest.o
|
||||||
|
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||||
|
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
|
||||||
|
run: build
|
||||||
|
$(EXEC) ./bandwidthTest
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f bandwidthTest bandwidthTest.o
|
||||||
|
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/bandwidthTest
|
||||||
|
|
||||||
|
clobber: clean
|
79
Samples/bandwidthTest/NsightEclipse.xml
Normal file
|
@ -0,0 +1,79 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||||
|
<entry>
|
||||||
|
<name>bandwidthTest</name>
|
||||||
|
<cuda_api_list>
|
||||||
|
<toolkit>cudaSetDevice</toolkit>
|
||||||
|
<toolkit>cudaHostAlloc</toolkit>
|
||||||
|
<toolkit>cudaFree</toolkit>
|
||||||
|
<toolkit>cudaMallocHost</toolkit>
|
||||||
|
<toolkit>cudaFreeHost</toolkit>
|
||||||
|
<toolkit>cudaMemcpy</toolkit>
|
||||||
|
<toolkit>cudaMemcpyAsync</toolkit>
|
||||||
|
<toolkit>cudaEventCreate</toolkit>
|
||||||
|
<toolkit>cudaEventRecord</toolkit>
|
||||||
|
<toolkit>cudaEventDestroy</toolkit>
|
||||||
|
<toolkit>cudaDeviceSynchronize</toolkit>
|
||||||
|
<toolkit>cudaEventElapsedTime</toolkit>
|
||||||
|
</cuda_api_list>
|
||||||
|
<description><![CDATA[This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.]]></description>
|
||||||
|
<devicecompilation>whole</devicecompilation>
|
||||||
|
<includepaths>
|
||||||
|
<path>./</path>
|
||||||
|
<path>../</path>
|
||||||
|
<path>../../common/inc</path>
|
||||||
|
</includepaths>
|
||||||
|
<keyconcepts>
|
||||||
|
<concept level="basic">CUDA Streams and Events</concept>
|
||||||
|
<concept level="basic">Performance Strategies</concept>
|
||||||
|
</keyconcepts>
|
||||||
|
<keywords>
|
||||||
|
<keyword>GPGPU</keyword>
|
||||||
|
<keyword>bandwidth</keyword>
|
||||||
|
</keywords>
|
||||||
|
<libraries>
|
||||||
|
</libraries>
|
||||||
|
<librarypaths>
|
||||||
|
</librarypaths>
|
||||||
|
<nsight_eclipse>true</nsight_eclipse>
|
||||||
|
<primary_file>bandwidthTest.cu</primary_file>
|
||||||
|
<scopes>
|
||||||
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
|
<scope>1:Performance Strategies</scope>
|
||||||
|
</scopes>
|
||||||
|
<sm-arch>sm30</sm-arch>
|
||||||
|
<sm-arch>sm35</sm-arch>
|
||||||
|
<sm-arch>sm37</sm-arch>
|
||||||
|
<sm-arch>sm50</sm-arch>
|
||||||
|
<sm-arch>sm52</sm-arch>
|
||||||
|
<sm-arch>sm60</sm-arch>
|
||||||
|
<sm-arch>sm61</sm-arch>
|
||||||
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
|
<sm-arch>sm75</sm-arch>
|
||||||
|
<supported_envs>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<platform>windows7</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>macosx</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>arm</arch>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>ppc64le</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
</supported_envs>
|
||||||
|
<supported_sm_architectures>
|
||||||
|
<include>all</include>
|
||||||
|
</supported_sm_architectures>
|
||||||
|
<title>Bandwidth Test</title>
|
||||||
|
<type>exe</type>
|
||||||
|
</entry>
|
94
Samples/bandwidthTest/README.md
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
# bandwidthTest - Bandwidth Test
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.
|
||||||
|
|
||||||
|
## Key Concepts
|
||||||
|
|
||||||
|
CUDA Streams and Events, Performance Strategies
|
||||||
|
|
||||||
|
## Supported SM Architectures
|
||||||
|
|
||||||
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
|
## Supported OSes
|
||||||
|
|
||||||
|
Linux, Windows, MacOSX
|
||||||
|
|
||||||
|
## Supported CPU Architecture
|
||||||
|
|
||||||
|
x86_64, ppc64le, armv7l
|
||||||
|
|
||||||
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
|
||||||
|
cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy, cudaMemcpyAsync, cudaEventCreate, cudaEventRecord, cudaEventDestroy, cudaDeviceSynchronize, cudaEventElapsedTime
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
|
## Build and Run
|
||||||
|
|
||||||
|
### Windows
|
||||||
|
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
|
||||||
|
```
|
||||||
|
*_vs<version>.sln - for Visual Studio <version>
|
||||||
|
```
|
||||||
|
Each individual sample has its own set of solution files in its directory:
|
||||||
|
|
||||||
|
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
|
||||||
|
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
|
||||||
|
|
||||||
|
### Linux
|
||||||
|
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
|
||||||
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
|
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
|
||||||
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||||
|
```
|
||||||
|
$ make SMS="50 60"
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=g++
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mac
|
||||||
|
The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
|
||||||
|
```
|
||||||
|
$ make SMS="A B ..."
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=clang
|
||||||
|
```
|
||||||
|
|
||||||
|
## References (for more details)
|
||||||
|
|
969
Samples/bandwidthTest/bandwidthTest.cu
Normal file
|
@ -0,0 +1,969 @@
|
||||||
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is a simple test program to measure the memcopy bandwidth of the GPU.
|
||||||
|
* It can measure device to device copy bandwidth, host to device copy bandwidth
|
||||||
|
* for pageable and pinned memory, and device to host copy bandwidth for
|
||||||
|
* pageable and pinned memory.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* ./bandwidthTest [option]...
|
||||||
|
*/
|
||||||
|
|
||||||
|
// CUDA runtime
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
// includes
|
||||||
|
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
|
||||||
|
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
|
||||||
|
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
static const char *sSDKsample = "CUDA Bandwidth Test";
|
||||||
|
|
||||||
|
// defines, project
|
||||||
|
#define MEMCOPY_ITERATIONS 100
|
||||||
|
#define DEFAULT_SIZE (32 * (1e6)) // 32 M
|
||||||
|
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M
|
||||||
|
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M
|
||||||
|
|
||||||
|
// shmoo mode defines
|
||||||
|
#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M
|
||||||
|
#define SHMOO_MEMSIZE_START (1e3) // 1 KB
|
||||||
|
#define SHMOO_INCREMENT_1KB (1e3) // 1 KB
|
||||||
|
#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB
|
||||||
|
#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB
|
||||||
|
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB
|
||||||
|
#define SHMOO_INCREMENT_1MB (1e6) // 1 MB
|
||||||
|
#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB
|
||||||
|
#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB
|
||||||
|
#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB
|
||||||
|
#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB
|
||||||
|
#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB
|
||||||
|
#define SHMOO_LIMIT_1MB (1e6) // 1 MB
|
||||||
|
#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB
|
||||||
|
#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB
|
||||||
|
|
||||||
|
// CPU cache flush
|
||||||
|
#define FLUSH_SIZE (256 * 1024 * 1024)
|
||||||
|
char *flush_buf;
|
||||||
|
|
||||||
|
// enums, project
|
||||||
|
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
|
||||||
|
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
|
||||||
|
enum printMode { USER_READABLE, CSV };
|
||||||
|
enum memoryMode { PINNED, PAGEABLE };
|
||||||
|
|
||||||
|
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
|
||||||
|
"Device to Device", NULL};
|
||||||
|
|
||||||
|
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};
|
||||||
|
|
||||||
|
// if true, use CPU based timing for everything
|
||||||
|
static bool bDontUseGPUTiming;
|
||||||
|
|
||||||
|
int *pArgc = NULL;
|
||||||
|
char **pArgv = NULL;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// declaration, forward
|
||||||
|
int runTest(const int argc, const char **argv);
|
||||||
|
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
|
||||||
|
testMode mode, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice, bool wc);
|
||||||
|
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc);
|
||||||
|
void testBandwidthRange(unsigned int start, unsigned int end,
|
||||||
|
unsigned int increment, memcpyKind kind,
|
||||||
|
printMode printmode, memoryMode memMode,
|
||||||
|
int startDevice, int endDevice, bool wc);
|
||||||
|
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc);
|
||||||
|
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
|
||||||
|
bool wc);
|
||||||
|
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
|
||||||
|
bool wc);
|
||||||
|
float testDeviceToDeviceTransfer(unsigned int memSize);
|
||||||
|
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
|
||||||
|
unsigned int count, memcpyKind kind,
|
||||||
|
memoryMode memMode, int iNumDevs, bool wc);
|
||||||
|
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
|
||||||
|
unsigned int count, memcpyKind kind, memoryMode memMode,
|
||||||
|
int iNumDevs, bool wc);
|
||||||
|
void printHelp(void);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Program main
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
pArgc = &argc;
|
||||||
|
pArgv = argv;
|
||||||
|
|
||||||
|
flush_buf = (char *)malloc(FLUSH_SIZE);
|
||||||
|
|
||||||
|
// set logfile name and start logs
|
||||||
|
printf("[%s] - Starting...\n", sSDKsample);
|
||||||
|
|
||||||
|
int iRetVal = runTest(argc, (const char **)argv);
|
||||||
|
|
||||||
|
if (iRetVal < 0) {
|
||||||
|
checkCudaErrors(cudaSetDevice(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// finish
|
||||||
|
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
|
||||||
|
|
||||||
|
printf(
|
||||||
|
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||||
|
"Results may vary when GPU Boost is enabled.\n");
|
||||||
|
|
||||||
|
free(flush_buf);
|
||||||
|
|
||||||
|
exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Parse args, run the appropriate tests
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
int runTest(const int argc, const char **argv) {
|
||||||
|
int start = DEFAULT_SIZE;
|
||||||
|
int end = DEFAULT_SIZE;
|
||||||
|
int startDevice = 0;
|
||||||
|
int endDevice = 0;
|
||||||
|
int increment = DEFAULT_INCREMENT;
|
||||||
|
testMode mode = QUICK_MODE;
|
||||||
|
bool htod = false;
|
||||||
|
bool dtoh = false;
|
||||||
|
bool dtod = false;
|
||||||
|
bool wc = false;
|
||||||
|
char *modeStr;
|
||||||
|
char *device = NULL;
|
||||||
|
printMode printmode = USER_READABLE;
|
||||||
|
char *memModeStr = NULL;
|
||||||
|
memoryMode memMode = PINNED;
|
||||||
|
|
||||||
|
// process command line args
|
||||||
|
if (checkCmdLineFlag(argc, argv, "help")) {
|
||||||
|
printHelp();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "csv")) {
|
||||||
|
printmode = CSV;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
|
||||||
|
if (strcmp(memModeStr, "pageable") == 0) {
|
||||||
|
memMode = PAGEABLE;
|
||||||
|
} else if (strcmp(memModeStr, "pinned") == 0) {
|
||||||
|
memMode = PINNED;
|
||||||
|
} else {
|
||||||
|
printf("Invalid memory mode - valid modes are pageable or pinned\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -1000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// default - pinned memory
|
||||||
|
memMode = PINNED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (getCmdLineArgumentString(argc, argv, "device", &device)) {
|
||||||
|
int deviceCount;
|
||||||
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||||
|
|
||||||
|
if (error_id != cudaSuccess) {
|
||||||
|
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
|
||||||
|
cudaGetErrorString(error_id));
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (deviceCount == 0) {
|
||||||
|
printf("!!!!!No devices found!!!!!\n");
|
||||||
|
return -2000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (strcmp(device, "all") == 0) {
|
||||||
|
printf(
|
||||||
|
"\n!!!!!Cumulative Bandwidth to be computed from all the devices "
|
||||||
|
"!!!!!!\n\n");
|
||||||
|
startDevice = 0;
|
||||||
|
endDevice = deviceCount - 1;
|
||||||
|
} else {
|
||||||
|
startDevice = endDevice = atoi(device);
|
||||||
|
|
||||||
|
if (startDevice >= deviceCount || startDevice < 0) {
|
||||||
|
printf(
|
||||||
|
"\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
|
||||||
|
"used !!!!!\n",
|
||||||
|
startDevice, 0);
|
||||||
|
startDevice = endDevice = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Running on...\n\n");
|
||||||
|
|
||||||
|
for (int currentDevice = startDevice; currentDevice <= endDevice;
|
||||||
|
currentDevice++) {
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
|
||||||
|
|
||||||
|
if (error_id == cudaSuccess) {
|
||||||
|
printf(" Device %d: %s\n", currentDevice, deviceProp.name);
|
||||||
|
|
||||||
|
if (deviceProp.computeMode == cudaComputeModeProhibited) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"Error: device is running in <Compute Mode Prohibited>, no "
|
||||||
|
"threads can use ::cudaSetDevice().\n");
|
||||||
|
checkCudaErrors(cudaSetDevice(currentDevice));
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
|
||||||
|
cudaGetErrorString(error_id));
|
||||||
|
checkCudaErrors(cudaSetDevice(currentDevice));
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
|
||||||
|
// figure out the mode
|
||||||
|
if (strcmp(modeStr, "quick") == 0) {
|
||||||
|
printf(" Quick Mode\n\n");
|
||||||
|
mode = QUICK_MODE;
|
||||||
|
} else if (strcmp(modeStr, "shmoo") == 0) {
|
||||||
|
printf(" Shmoo Mode\n\n");
|
||||||
|
mode = SHMOO_MODE;
|
||||||
|
} else if (strcmp(modeStr, "range") == 0) {
|
||||||
|
printf(" Range Mode\n\n");
|
||||||
|
mode = RANGE_MODE;
|
||||||
|
} else {
|
||||||
|
printf("Invalid mode - valid modes are quick, range, or shmoo\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -3000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// default mode - quick
|
||||||
|
printf(" Quick Mode\n\n");
|
||||||
|
mode = QUICK_MODE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "htod")) {
|
||||||
|
htod = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "dtoh")) {
|
||||||
|
dtoh = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "dtod")) {
|
||||||
|
dtod = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if CUDART_VERSION >= 2020
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "wc")) {
|
||||||
|
wc = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "cputiming")) {
|
||||||
|
bDontUseGPUTiming = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!htod && !dtoh && !dtod) {
|
||||||
|
// default: All
|
||||||
|
htod = true;
|
||||||
|
dtoh = true;
|
||||||
|
dtod = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (RANGE_MODE == mode) {
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "start")) {
|
||||||
|
start = getCmdLineArgumentInt(argc, argv, "start");
|
||||||
|
|
||||||
|
if (start <= 0) {
|
||||||
|
printf("Illegal argument - start must be greater than zero\n");
|
||||||
|
return -4000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("Must specify a starting size in range mode\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -5000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "end")) {
|
||||||
|
end = getCmdLineArgumentInt(argc, argv, "end");
|
||||||
|
|
||||||
|
if (end <= 0) {
|
||||||
|
printf("Illegal argument - end must be greater than zero\n");
|
||||||
|
return -6000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start > end) {
|
||||||
|
printf("Illegal argument - start is greater than end\n");
|
||||||
|
return -7000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("Must specify an end size in range mode.\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -8000;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(argc, argv, "increment")) {
|
||||||
|
increment = getCmdLineArgumentInt(argc, argv, "increment");
|
||||||
|
|
||||||
|
if (increment <= 0) {
|
||||||
|
printf("Illegal argument - increment must be greater than zero\n");
|
||||||
|
return -9000;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printf("Must specify an increment in user mode\n");
|
||||||
|
printf("See --help for more information\n");
|
||||||
|
return -10000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (htod) {
|
||||||
|
testBandwidth((unsigned int)start, (unsigned int)end,
|
||||||
|
(unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
|
||||||
|
memMode, startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dtoh) {
|
||||||
|
testBandwidth((unsigned int)start, (unsigned int)end,
|
||||||
|
(unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
|
||||||
|
memMode, startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dtod) {
|
||||||
|
testBandwidth((unsigned int)start, (unsigned int)end,
|
||||||
|
(unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
|
||||||
|
memMode, startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that we reset all CUDA Devices in question
|
||||||
|
for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
|
||||||
|
cudaSetDevice(nDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Run a bandwidth test
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
|
||||||
|
testMode mode, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc) {
|
||||||
|
switch (mode) {
|
||||||
|
case QUICK_MODE:
|
||||||
|
testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
|
||||||
|
endDevice, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RANGE_MODE:
|
||||||
|
testBandwidthRange(start, end, increment, kind, printmode, memMode,
|
||||||
|
startDevice, endDevice, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SHMOO_MODE:
|
||||||
|
testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Run a quick mode bandwidth test
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc) {
|
||||||
|
testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode,
|
||||||
|
startDevice, endDevice, wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////
|
||||||
|
// Run a range mode bandwidth test
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidthRange(unsigned int start, unsigned int end,
|
||||||
|
unsigned int increment, memcpyKind kind,
|
||||||
|
printMode printmode, memoryMode memMode,
|
||||||
|
int startDevice, int endDevice, bool wc) {
|
||||||
|
// count the number of copies we're going to run
|
||||||
|
unsigned int count = 1 + ((end - start) / increment);
|
||||||
|
|
||||||
|
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
|
||||||
|
double *bandwidths = (double *)malloc(count * sizeof(double));
|
||||||
|
|
||||||
|
// Before calculating the cumulative bandwidth, initialize bandwidths array to
|
||||||
|
// NULL
|
||||||
|
for (unsigned int i = 0; i < count; i++) {
|
||||||
|
bandwidths[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the device asked by the user
|
||||||
|
for (int currentDevice = startDevice; currentDevice <= endDevice;
|
||||||
|
currentDevice++) {
|
||||||
|
cudaSetDevice(currentDevice);
|
||||||
|
|
||||||
|
// run each of the copies
|
||||||
|
for (unsigned int i = 0; i < count; i++) {
|
||||||
|
memSizes[i] = start + i * increment;
|
||||||
|
|
||||||
|
switch (kind) {
|
||||||
|
case DEVICE_TO_HOST:
|
||||||
|
bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case HOST_TO_DEVICE:
|
||||||
|
bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DEVICE_TO_DEVICE:
|
||||||
|
bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // Complete the bandwidth computation on all the devices
|
||||||
|
|
||||||
|
// print results
|
||||||
|
if (printmode == CSV) {
|
||||||
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
} else {
|
||||||
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
free(memSizes);
|
||||||
|
free(bandwidths);
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Intense shmoo mode - covers a large range of values with varying increments
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
|
||||||
|
memoryMode memMode, int startDevice, int endDevice,
|
||||||
|
bool wc) {
|
||||||
|
// count the number of copies to make
|
||||||
|
unsigned int count =
|
||||||
|
1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
|
||||||
|
((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
|
||||||
|
((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
|
||||||
|
((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
|
||||||
|
((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
|
||||||
|
((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
|
||||||
|
((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
|
||||||
|
|
||||||
|
unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
|
||||||
|
double *bandwidths = (double *)malloc(count * sizeof(double));
|
||||||
|
|
||||||
|
// Before calculating the cumulative bandwidth, initialize bandwidths array to
|
||||||
|
// NULL
|
||||||
|
for (unsigned int i = 0; i < count; i++) {
|
||||||
|
bandwidths[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the device asked by the user
|
||||||
|
for (int currentDevice = startDevice; currentDevice <= endDevice;
|
||||||
|
currentDevice++) {
|
||||||
|
cudaSetDevice(currentDevice);
|
||||||
|
// Run the shmoo
|
||||||
|
int iteration = 0;
|
||||||
|
unsigned int memSize = 0;
|
||||||
|
|
||||||
|
while (memSize <= SHMOO_MEMSIZE_MAX) {
|
||||||
|
if (memSize < SHMOO_LIMIT_20KB) {
|
||||||
|
memSize += SHMOO_INCREMENT_1KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_50KB) {
|
||||||
|
memSize += SHMOO_INCREMENT_2KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_100KB) {
|
||||||
|
memSize += SHMOO_INCREMENT_10KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_1MB) {
|
||||||
|
memSize += SHMOO_INCREMENT_100KB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_16MB) {
|
||||||
|
memSize += SHMOO_INCREMENT_1MB;
|
||||||
|
} else if (memSize < SHMOO_LIMIT_32MB) {
|
||||||
|
memSize += SHMOO_INCREMENT_2MB;
|
||||||
|
} else {
|
||||||
|
memSize += SHMOO_INCREMENT_4MB;
|
||||||
|
}
|
||||||
|
|
||||||
|
memSizes[iteration] = memSize;
|
||||||
|
|
||||||
|
switch (kind) {
|
||||||
|
case DEVICE_TO_HOST:
|
||||||
|
bandwidths[iteration] +=
|
||||||
|
testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case HOST_TO_DEVICE:
|
||||||
|
bandwidths[iteration] +=
|
||||||
|
testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DEVICE_TO_DEVICE:
|
||||||
|
bandwidths[iteration] +=
|
||||||
|
testDeviceToDeviceTransfer(memSizes[iteration]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
iteration++;
|
||||||
|
printf(".");
|
||||||
|
fflush(0);
|
||||||
|
}
|
||||||
|
} // Complete the bandwidth computation on all the devices
|
||||||
|
|
||||||
|
// print results
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
if (CSV == printmode) {
|
||||||
|
printResultsCSV(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
} else {
|
||||||
|
printResultsReadable(memSizes, bandwidths, count, kind, memMode,
|
||||||
|
(1 + endDevice - startDevice), wc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
free(memSizes);
|
||||||
|
free(bandwidths);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// test the bandwidth of a device to host memcopy of a specific size
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
//! Measure device-to-host memcpy bandwidth (GB/s) for a transfer of
//! memSize bytes.
//!
//! @param memSize  transfer size in bytes
//! @param memMode  PINNED: host buffers from cudaHostAlloc, MEMCOPY_ITERATIONS
//!                 async copies timed with CUDA events (or the CPU timer when
//!                 bDontUseGPUTiming is set).
//!                 PAGEABLE: malloc'd host buffers, blocking cudaMemcpy calls
//!                 timed with the CPU timer.
//! @param wc       request write-combined pinned memory (PINNED mode only)
//! @return aggregate bandwidth in GB/s over all iterations
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  unsigned char *h_idata = NULL;
  unsigned char *h_odata = NULL;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  if (PINNED == memMode) {
    // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
    checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // pageable memory mode - use malloc
    h_idata = (unsigned char *)malloc(memSize);
    h_odata = (unsigned char *)malloc(memSize);

    if (h_idata == 0 || h_odata == 0) {
      // fixed typo ("avaialable"); now consistent with the H2D test's message
      fprintf(stderr, "Not enough memory available on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  // initialize the memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  // initialize the device memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // copy data from GPU to Host
  if (PINNED == memMode) {
    if (bDontUseGPUTiming) sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));
    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
                                      cudaMemcpyDeviceToHost, 0));
    }
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
    if (bDontUseGPUTiming) {
      sdkStopTimer(&timer);
      elapsedTimeInMs = sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
    }
  } else {
    elapsedTimeInMs = 0;
    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      sdkStartTimer(&timer);
      checkCudaErrors(
          cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
      sdkStopTimer(&timer);
      elapsedTimeInMs += sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
      // scrub CPU caches between iterations so the pageable-copy timing is
      // not flattered by warm cache lines
      memset(flush_buf, i, FLUSH_SIZE);
    }
  }

  // calculate bandwidth in GB/s
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&timer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_idata));
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_idata);
    free(h_odata);
  }

  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! test the bandwidth of a host to device memcopy of a specific size
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
//! Measure host-to-device memcpy bandwidth (GB/s) for a memSize-byte buffer.
//! PINNED mode times MEMCOPY_ITERATIONS asynchronous copies with CUDA events
//! (or the CPU timer when bDontUseGPUTiming is set); PAGEABLE mode times
//! blocking copies with the CPU timer.
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *cpuTimer = NULL;
  sdkCreateTimer(&cpuTimer);

  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  float elapsedMs = 0.0f;

  // Host-side source buffer for the transfers.
  unsigned char *h_odata = NULL;

  if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
    // OS-pinned allocation, optionally write-combined.
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    // Older runtimes: plain pinned allocation.
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // Pageable memory comes straight from malloc.
    h_odata = (unsigned char *)malloc(memSize);

    if (h_odata == 0) {
      fprintf(stderr, "Not enough memory available on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
  unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);

  if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // Fill the source buffer with a repeating byte pattern.
  for (unsigned int idx = 0; idx < memSize / sizeof(unsigned char); idx++) {
    h_odata[idx] = (unsigned char)(idx & 0xff);
  }

  // Fill the cache-clear buffers with complementary byte patterns.
  for (unsigned int idx = 0; idx < CACHE_CLEAR_SIZE / sizeof(unsigned char);
       idx++) {
    h_cacheClear1[idx] = (unsigned char)(idx & 0xff);
    h_cacheClear2[idx] = (unsigned char)(0xff - (idx & 0xff));
  }

  // Device-side destination buffer.
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  if (PINNED == memMode) {
    // Event-timed batch of asynchronous copies (CPU timer as fallback).
    if (bDontUseGPUTiming) sdkStartTimer(&cpuTimer);

    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int iter = 0; iter < MEMCOPY_ITERATIONS; iter++) {
      checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
                                      cudaMemcpyHostToDevice, 0));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedMs, start, stop));

    if (bDontUseGPUTiming) {
      sdkStopTimer(&cpuTimer);
      elapsedMs = sdkGetTimerValue(&cpuTimer);
      sdkResetTimer(&cpuTimer);
    }
  } else {
    // CPU-timed blocking copies, one at a time.
    elapsedMs = 0;

    for (unsigned int iter = 0; iter < MEMCOPY_ITERATIONS; iter++) {
      sdkStartTimer(&cpuTimer);
      checkCudaErrors(
          cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
      sdkStopTimer(&cpuTimer);
      elapsedMs += sdkGetTimerValue(&cpuTimer);
      sdkResetTimer(&cpuTimer);
      memset(flush_buf, iter, FLUSH_SIZE);
    }
  }

  // Convert elapsed time to GB/s over all iterations.
  double seconds = elapsedMs / 1e3;
  float bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / seconds;

  // Release timing resources and buffers.
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&cpuTimer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_odata);
  }

  free(h_cacheClear1);
  free(h_cacheClear2);
  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! test the bandwidth of a device to device memcopy of a specific size
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
//! Measure device-to-device memcpy bandwidth (GB/s) for memSize bytes.
//! Each iteration both reads and writes memSize bytes of device memory, so
//! the reported bandwidth counts 2x the transfer size.
//!
//! @param memSize  transfer size in bytes
//! @return aggregate bandwidth in GB/s over all iterations
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  unsigned char *h_idata = (unsigned char *)malloc(memSize);

  if (h_idata == 0) {
    // fixed typo ("avaialable"); now consistent with the H2D test's message
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // initialize the host memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory (source and destination)
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
  unsigned char *d_odata;
  checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));

  // initialize memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // run the memcopy
  sdkStartTimer(&timer);
  checkCudaErrors(cudaEventRecord(start, 0));

  for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
    checkCudaErrors(
        cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
  }

  checkCudaErrors(cudaEventRecord(stop, 0));

  // Since device to device memory copies are non-blocking,
  // cudaDeviceSynchronize() is required in order to get
  // proper timing.
  checkCudaErrors(cudaDeviceSynchronize());

  // get the total elapsed time in ms
  sdkStopTimer(&timer);
  checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

  if (bDontUseGPUTiming) {
    elapsedTimeInMs = sdkGetTimerValue(&timer);
  }

  // calculate bandwidth in GB/s (factor 2: every byte is read and written)
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  sdkDeleteTimer(&timer);
  free(h_idata);
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));

  return bandwidthInGBs;
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
// print results in an easily read format
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
/////////////////////////////////////////////////////////
// Print a human-readable table of transfer sizes and bandwidths.
//
// memSizes/bandwidths: parallel arrays of `count` measurements
// kind/memMode: labels for the table header
// iNumDevs: number of devices the measurement spans
// wc: whether write-combined pinned memory was used (header note only)
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
                          unsigned int count, memcpyKind kind,
                          memoryMode memMode, int iNumDevs, bool wc) {
  printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
  printf(" %s Memory Transfers\n", sMemoryMode[memMode]);

  if (wc) {
    printf(" Write-Combined Memory Writes are Enabled");
  }

  printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");

  // Print every row with one uniform format. The previous code looped to
  // (count - 1) and printed the last row separately, which underflowed the
  // unsigned `count` when count == 0 and read memSizes[-1]; this form emits
  // byte-identical output for count >= 1 and is safe for count == 0.
  for (unsigned int i = 0; i < count; i++) {
    // extra tab keeps the bandwidth column aligned for short sizes
    printf(" %u\t\t\t%s%.1f\n", memSizes[i],
           (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
  }

  printf("\n");
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// print results in a database format
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////
// Print one CSV-style line per measurement, tagged with a short config
// string describing transfer direction (D2D/D2H/H2D) and, for host
// transfers, the memory mode (-Paged/-Pinned[-WriteCombined]).
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
                     unsigned int count, memcpyKind kind, memoryMode memMode,
                     int iNumDevs, bool wc) {
  std::string sConfig;

  // log config information
  if (kind == DEVICE_TO_DEVICE) {
    sConfig += "D2D";
  } else {
    if (kind == DEVICE_TO_HOST) {
      sConfig += "D2H";
    } else if (kind == HOST_TO_DEVICE) {
      sConfig += "H2D";
    }

    if (memMode == PAGEABLE) {
      sConfig += "-Paged";
    } else if (memMode == PINNED) {
      sConfig += "-Pinned";

      if (wc) {
        sConfig += "-WriteCombined";
      }
    }
  }

  unsigned int i;
  double dSeconds = 0.0;

  for (i = 0; i < count; i++) {
    // bandwidths[] is in GB/s (1e9 bytes/s — see the transfer tests), so
    // recover the elapsed time with the same 1e9 scale. The previous
    // (1 << 20) divisor mixed a MiB scale into a GB/s figure and inflated
    // the reported time by roughly 953x.
    dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
    printf(
        "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
        "bytes, NumDevsUsed = %d\n",
        sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
  }
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Print help screen
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////////////////////
// Print the command-line usage summary for bandwidthTest.
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
  // Usage line, summary, and a worked example (adjacent literals are
  // concatenated; the emitted text is unchanged).
  printf(
      "Usage: bandwidthTest [OPTION]...\n"
      "Test the bandwidth for device to host, host to device, and device to "
      "device transfers\n"
      "\n"
      "Example: measure the bandwidth of device to host pinned memory copies "
      "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"
      "./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
      "--increment=1024 --dtoh\n");

  // General options.
  printf(
      "\n"
      "Options:\n"
      "--help\tDisplay this help menu\n"
      "--csv\tPrint results as a CSV\n"
      "--device=[deviceno]\tSpecify the device device to be used\n"
      "  all - compute cumulative bandwidth on all the devices\n"
      "  0,1,2,...,n - Specify any particular device to be used\n"
      "--memory=[MEMMODE]\tSpecify which memory mode to use\n"
      "  pageable - pageable memory\n"
      "  pinned   - non-pageable system memory\n"
      "--mode=[MODE]\tSpecify the mode to use\n"
      "  quick - performs a quick measurement\n"
      "  range - measures a user-specified range of values\n"
      "  shmoo - performs an intense shmoo of a large range of values\n"
      "--htod\tMeasure host to device transfers\n"
      "--dtoh\tMeasure device to host transfers\n"
      "--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
  printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
  printf("--cputiming\tForce CPU-based timing always\n");

  // Options that only apply to --mode=range.
  printf(
      "Range mode options\n"
      "--start=[SIZE]\tStarting transfer size in bytes\n"
      "--end=[SIZE]\tEnding transfer size in bytes\n"
      "--increment=[SIZE]\tIncrement size in bytes\n");
}
|
20
Samples/bandwidthTest/bandwidthTest_vs2012.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2012
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2012</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v110</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/bandwidthTest/bandwidthTest_vs2013.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 13.00
|
||||||
|
# Visual Studio 2013
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2013</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v120</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/bandwidthTest/bandwidthTest_vs2015.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 14.00
|
||||||
|
# Visual Studio 2015
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2015</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v140</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/bandwidthTest/bandwidthTest_vs2017.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2017
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
108
Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>bandwidthTest_vs2017</RootNamespace>
|
||||||
|
<ProjectName>bandwidthTest</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v141</PlatformToolset>
|
||||||
|
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="bandwidthTest.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
@ -264,7 +268,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
LIBRARIES += -lcublas_static -lcusparse_static -lculibos
|
LIBRARIES += -lcublas_static -lcublasLt_static -lcusparse_static -lculibos
|
||||||
|
|
||||||
ifeq ($(SAMPLE_ENABLED),0)
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
EXEC ?= @echo "[@]"
|
EXEC ?= @echo "[@]"
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
</keywords>
|
</keywords>
|
||||||
<libraries>
|
<libraries>
|
||||||
<library>cublas_static</library>
|
<library>cublas_static</library>
|
||||||
|
<library>cublasLt_static</library>
|
||||||
<library>cusparse_static</library>
|
<library>cusparse_static</library>
|
||||||
<library>culibos</library>
|
<library>culibos</library>
|
||||||
</libraries>
|
</libraries>
|
||||||
|
@ -55,6 +56,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Linear Algebra, CUBLAS Library, CUSPARSE Library
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch,
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -323,7 +323,7 @@ int main(int argc, char **argv) {
|
||||||
checkCudaErrors(cudaStreamCreate(&streamForGraph));
|
checkCudaErrors(cudaStreamCreate(&streamForGraph));
|
||||||
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
|
checkCudaErrors(cublasSetStream(cublasHandle, stream1));
|
||||||
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
|
checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
|
||||||
checkCudaErrors(cudaStreamBeginCapture(stream1));
|
checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
|
||||||
|
|
||||||
r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
|
r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
|
||||||
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -264,7 +264,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 60 61 70 75
|
SMS ?= 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -42,6 +42,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ x86_64, ppc64le
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -264,7 +264,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 60 61 70 75
|
SMS ?= 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -49,6 +49,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
|
@ -415,7 +415,7 @@ void getIdenticalGPUs(int num_of_gpus, std::set<int> &identicalGPUs) {
|
||||||
identicalGPUs.erase(it);
|
identicalGPUs.erase(it);
|
||||||
}
|
}
|
||||||
if (!deviceProp.cooperativeMultiDeviceLaunch ||
|
if (!deviceProp.cooperativeMultiDeviceLaunch ||
|
||||||
!deviceProp.concurrentManagedAccess) {
|
!deviceProp.managedMemory) {
|
||||||
identicalGPUs.erase(it);
|
identicalGPUs.erase(it);
|
||||||
}
|
}
|
||||||
it++;
|
it++;
|
||||||
|
@ -450,8 +450,7 @@ int main(int argc, char **argv) {
|
||||||
if (identicalGPUs.size() <= 1) {
|
if (identicalGPUs.size() <= 1) {
|
||||||
printf(
|
printf(
|
||||||
"No Two or more GPUs with same architecture capable of "
|
"No Two or more GPUs with same architecture capable of "
|
||||||
"cooperativeMultiDeviceLaunch & concurrentManagedAccess found. "
|
"cooperativeMultiDeviceLaunch & managedMemory found. \nWaiving the sample\n");
|
||||||
"\nWaiving the sample\n");
|
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -104,6 +104,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -246,12 +246,6 @@ ifeq ($(TARGET_ARCH),armv7l)
|
||||||
SAMPLE_ENABLED := 0
|
SAMPLE_ENABLED := 0
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# This sample is not supported on aarch64
|
|
||||||
ifeq ($(TARGET_ARCH),aarch64)
|
|
||||||
$(info >>> WARNING - cudaTensorCoreGemm is not supported on aarch64 - waiving sample <<<)
|
|
||||||
SAMPLE_ENABLED := 0
|
|
||||||
endif
|
|
||||||
|
|
||||||
ALL_LDFLAGS :=
|
ALL_LDFLAGS :=
|
||||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
@ -264,7 +258,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 70 75
|
SMS ?= 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -43,12 +43,16 @@ In addition to that, it demonstrates the use of the new CUDA function attribute
|
||||||
<scope>1:CUDA Basic Topics</scope>
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
</scopes>
|
</scopes>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
<arch>x86_64</arch>
|
<arch>x86_64</arch>
|
||||||
<platform>linux</platform>
|
<platform>linux</platform>
|
||||||
</env>
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>aarch64</arch>
|
||||||
|
</env>
|
||||||
<env>
|
<env>
|
||||||
<platform>windows7</platform>
|
<platform>windows7</platform>
|
||||||
</env>
|
</env>
|
||||||
|
|
|
@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ Linux, Windows
|
||||||
|
|
||||||
## Supported CPU Architecture
|
## Supported CPU Architecture
|
||||||
|
|
||||||
x86_64, ppc64le
|
x86_64, ppc64le, aarch64
|
||||||
|
|
||||||
## CUDA APIs involved
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
@ -52,9 +52,9 @@ $ cd <sample_dir>
|
||||||
$ make
|
$ make
|
||||||
```
|
```
|
||||||
The samples makefiles can take advantage of certain options:
|
The samples makefiles can take advantage of certain options:
|
||||||
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le.
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
|
||||||
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/>
|
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
|
||||||
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
* **dbg=1** - build with debug symbols
|
* **dbg=1** - build with debug symbols
|
||||||
```
|
```
|
||||||
|
|
|
@ -180,16 +180,16 @@
|
||||||
|
|
||||||
using namespace nvcuda;
|
using namespace nvcuda;
|
||||||
|
|
||||||
__host__ void init_host_matrices(float *a, float *b, float *c) {
|
__host__ void init_host_matrices(half *a, half *b, float *c) {
|
||||||
for (int i = 0; i < M_GLOBAL; i++) {
|
for (int i = 0; i < M_GLOBAL; i++) {
|
||||||
for (int j = 0; j < K_GLOBAL; j++) {
|
for (int j = 0; j < K_GLOBAL; j++) {
|
||||||
a[i * K_GLOBAL + j] = static_cast<float>(rand() % 3);
|
a[i * K_GLOBAL + j] = (half)(rand() % 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < N_GLOBAL; i++) {
|
for (int i = 0; i < N_GLOBAL; i++) {
|
||||||
for (int j = 0; j < K_GLOBAL; j++) {
|
for (int j = 0; j < K_GLOBAL; j++) {
|
||||||
b[i * K_GLOBAL + j] = static_cast<float>(rand() % 3);
|
b[i * K_GLOBAL + j] = (half)(rand() % 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,26 +198,6 @@ __host__ void init_host_matrices(float *a, float *b, float *c) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void init_device_matrices(const float *A_h, const float *B_h,
|
|
||||||
const float *C_h, half *A, half *B,
|
|
||||||
float *C, float *D) {
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * K_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
A[i] = __float2half(A_h[i]);
|
|
||||||
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N_GLOBAL * K_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
B[i] = __float2half(B_h[i]);
|
|
||||||
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
C[i] = C_h[i];
|
|
||||||
|
|
||||||
for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL;
|
|
||||||
i += gridDim.x * blockDim.x)
|
|
||||||
D[i] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__ void compute_gemm(const half *A, const half *B, const float *C,
|
__global__ void compute_gemm(const half *A, const half *B, const float *C,
|
||||||
float *D, float alpha, float beta) {
|
float *D, float alpha, float beta) {
|
||||||
extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
|
extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
|
||||||
|
@ -486,7 +466,7 @@ __global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
|
__host__ void matMultiplyOnHost(half *A, half *B, float *C, float alpha,
|
||||||
float beta, int numARows, int numAColumns,
|
float beta, int numARows, int numAColumns,
|
||||||
int numBRows, int numBColumns, int numCRows,
|
int numBRows, int numBColumns, int numCRows,
|
||||||
int numCColumns) {
|
int numCColumns) {
|
||||||
|
@ -495,7 +475,7 @@ __host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
|
||||||
float temp = 0.0;
|
float temp = 0.0;
|
||||||
|
|
||||||
for (int k = 0; k < numAColumns; k++) {
|
for (int k = 0; k < numAColumns; k++) {
|
||||||
temp += A[i * numAColumns + k] * B[j * numBRows + k];
|
temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k];
|
||||||
}
|
}
|
||||||
|
|
||||||
C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
|
C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
|
||||||
|
@ -514,7 +494,7 @@ int main(int argc, char **argv) {
|
||||||
// Tensor cores require a GPU of Volta (SM7X) architecture or higher.
|
// Tensor cores require a GPU of Volta (SM7X) architecture or higher.
|
||||||
if (deviceProp.major < 7) {
|
if (deviceProp.major < 7) {
|
||||||
printf(
|
printf(
|
||||||
"cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor "
|
"cudaTensorCoreGemm requires SM 7.0 or higher to use Tensor "
|
||||||
"Cores. Exiting...\n");
|
"Cores. Exiting...\n");
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
}
|
}
|
||||||
|
@ -523,25 +503,20 @@ int main(int argc, char **argv) {
|
||||||
printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
|
printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
|
||||||
printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
|
printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
|
||||||
|
|
||||||
float *A_h = NULL;
|
half *A_h = NULL;
|
||||||
float *B_h = NULL;
|
half *B_h = NULL;
|
||||||
float *C_h = NULL;
|
float *C_h = NULL;
|
||||||
#if CPU_DEBUG
|
#if CPU_DEBUG
|
||||||
float *result_hD = NULL;
|
float *result_hD = NULL;
|
||||||
float *result_host = NULL;
|
float *result_host = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&A_h),
|
A_h = (half *)malloc(sizeof(half) * M_GLOBAL * K_GLOBAL);
|
||||||
sizeof(float) * M_GLOBAL * K_GLOBAL));
|
B_h = (half *)malloc(sizeof(half) * K_GLOBAL * N_GLOBAL);
|
||||||
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&B_h),
|
C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
|
||||||
sizeof(float) * K_GLOBAL * N_GLOBAL));
|
|
||||||
checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&C_h),
|
|
||||||
sizeof(float) * M_GLOBAL * N_GLOBAL));
|
|
||||||
#if CPU_DEBUG
|
#if CPU_DEBUG
|
||||||
checkCudaErrors(cudaMallocManaged((void **)&result_hD,
|
result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
|
||||||
sizeof(float) * M_GLOBAL * N_GLOBAL));
|
result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
|
||||||
checkCudaErrors(cudaMallocManaged((void **)&result_host,
|
|
||||||
sizeof(float) * M_GLOBAL * N_GLOBAL));
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
half *A = NULL;
|
half *A = NULL;
|
||||||
|
@ -567,11 +542,13 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
printf("Preparing data for GPU...\n");
|
printf("Preparing data for GPU...\n");
|
||||||
|
|
||||||
checkKernelErrors(
|
checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL,
|
||||||
(init_device_matrices<<<deviceProp.multiProcessorCount,
|
cudaMemcpyHostToDevice));
|
||||||
THREADS_PER_BLOCK>>>(A_h, B_h, C_h, A, B, C, D)));
|
checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL,
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
checkCudaErrors(cudaDeviceSynchronize());
|
checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL,
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
|
checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL));
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
// Compute the right amount of shared memory to request.
|
// Compute the right amount of shared memory to request.
|
||||||
|
@ -650,6 +627,8 @@ int main(int argc, char **argv) {
|
||||||
printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i],
|
printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i],
|
||||||
result_host[i]);
|
result_host[i]);
|
||||||
}
|
}
|
||||||
|
free(result_hD);
|
||||||
|
free(result_host);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float milliseconds = 0;
|
float milliseconds = 0;
|
||||||
|
@ -662,9 +641,9 @@ int main(int argc, char **argv) {
|
||||||
(milliseconds / 1000.)) /
|
(milliseconds / 1000.)) /
|
||||||
1e12);
|
1e12);
|
||||||
|
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A_h)));
|
free(A_h);
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B_h)));
|
free(B_h);
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C_h)));
|
free(C_h);
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
|
checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
|
checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
|
||||||
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
|
checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Runtime API, Device Query
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
318
Samples/immaTensorCoreGemm/Makefile
Normal file
|
@ -0,0 +1,318 @@
|
||||||
|
################################################################################
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
#
|
||||||
|
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Location of the CUDA Toolkit
|
||||||
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# start deprecated interface #
|
||||||
|
##############################
|
||||||
|
ifeq ($(x86_64),1)
|
||||||
|
$(info WARNING - x86_64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||||
|
TARGET_ARCH ?= x86_64
|
||||||
|
endif
|
||||||
|
ifeq ($(ARMv7),1)
|
||||||
|
$(info WARNING - ARMv7 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||||
|
TARGET_ARCH ?= armv7l
|
||||||
|
endif
|
||||||
|
ifeq ($(aarch64),1)
|
||||||
|
$(info WARNING - aarch64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||||
|
TARGET_ARCH ?= aarch64
|
||||||
|
endif
|
||||||
|
ifeq ($(ppc64le),1)
|
||||||
|
$(info WARNING - ppc64le variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||||
|
TARGET_ARCH ?= ppc64le
|
||||||
|
endif
|
||||||
|
ifneq ($(GCC),)
|
||||||
|
$(info WARNING - GCC variable has been deprecated)
|
||||||
|
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||||
|
HOST_COMPILER ?= $(GCC)
|
||||||
|
endif
|
||||||
|
ifneq ($(abi),)
|
||||||
|
$(error ERROR - abi variable has been removed)
|
||||||
|
endif
|
||||||
|
############################
|
||||||
|
# end deprecated interface #
|
||||||
|
############################
|
||||||
|
|
||||||
|
# architecture
|
||||||
|
HOST_ARCH := $(shell uname -m)
|
||||||
|
TARGET_ARCH ?= $(HOST_ARCH)
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
|
||||||
|
TARGET_SIZE := 64
|
||||||
|
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||||
|
TARGET_SIZE := 32
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||||
|
endif
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
|
||||||
|
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||||
|
TARGET_ARCH = armv7l
|
||||||
|
endif
|
||||||
|
|
||||||
|
# operating system
|
||||||
|
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||||
|
TARGET_OS ?= $(HOST_OS)
|
||||||
|
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||||
|
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# host compiler
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||||
|
HOST_COMPILER ?= clang++
|
||||||
|
endif
|
||||||
|
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||||
|
ifeq ($(TARGET_OS),linux)
|
||||||
|
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
ifeq ($(TARGET_OS), linux)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
|
||||||
|
else ifeq ($(TARGET_OS), android)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
HOST_COMPILER ?= g++
|
||||||
|
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||||
|
|
||||||
|
# internal flags
|
||||||
|
NVCCFLAGS := -m${TARGET_SIZE}
|
||||||
|
CCFLAGS :=
|
||||||
|
LDFLAGS :=
|
||||||
|
|
||||||
|
# build flags
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||||
|
CCFLAGS += -arch $(HOST_ARCH)
|
||||||
|
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||||
|
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||||
|
CCFLAGS += -mfloat-abi=hard
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
LDFLAGS += -pie
|
||||||
|
CCFLAGS += -fpie -fpic -fexceptions
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||||
|
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
CCFLAGS += -DWIN_INTERFACE_CUSTOM
|
||||||
|
LDFLAGS += -lsocket
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Install directory of different arch
|
||||||
|
CUDA_INSTALL_TARGET_DIR :=
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debug build flags
|
||||||
|
ifeq ($(dbg),1)
|
||||||
|
NVCCFLAGS += -g -G
|
||||||
|
BUILD_TYPE := debug
|
||||||
|
else
|
||||||
|
BUILD_TYPE := release
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS :=
|
||||||
|
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on Mac OSX
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
$(info >>> WARNING - immaTensorCoreGemm is not supported on Mac OSX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
# This sample is not supported on ARMv7
|
||||||
|
ifeq ($(TARGET_ARCH),armv7l)
|
||||||
|
$(info >>> WARNING - immaTensorCoreGemm is not supported on ARMv7 - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_LDFLAGS :=
|
||||||
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||||
|
|
||||||
|
# Common includes and paths for CUDA
|
||||||
|
INCLUDES := -I../../Common
|
||||||
|
LIBRARIES :=
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 72 75
|
||||||
|
else
|
||||||
|
SMS ?= 75
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SMS),)
|
||||||
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GENCODE_FLAGS),)
|
||||||
|
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||||
|
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||||
|
|
||||||
|
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||||
|
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||||
|
ifneq ($(HIGHEST_SM),)
|
||||||
|
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS += -maxrregcount=255
|
||||||
|
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
EXEC ?= @echo "[@]"
|
||||||
|
endif
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Target rules
|
||||||
|
all: build
|
||||||
|
|
||||||
|
build: immaTensorCoreGemm
|
||||||
|
|
||||||
|
check.deps:
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
@echo "Sample will be waived due to the above missing dependencies"
|
||||||
|
else
|
||||||
|
@echo "Sample is ready - all dependencies have been met"
|
||||||
|
endif
|
||||||
|
|
||||||
|
immaTensorCoreGemm.o:immaTensorCoreGemm.cu
|
||||||
|
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
|
immaTensorCoreGemm: immaTensorCoreGemm.o
|
||||||
|
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||||
|
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
|
||||||
|
run: build
|
||||||
|
$(EXEC) ./immaTensorCoreGemm
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f immaTensorCoreGemm immaTensorCoreGemm.o
|
||||||
|
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/immaTensorCoreGemm
|
||||||
|
|
||||||
|
clobber: clean
|
64
Samples/immaTensorCoreGemm/NsightEclipse.xml
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||||
|
<entry>
|
||||||
|
<name>immaTensorCoreGemm</name>
|
||||||
|
<cflags>
|
||||||
|
<flag>-maxrregcount=255</flag>
|
||||||
|
</cflags>
|
||||||
|
<cuda_api_list>
|
||||||
|
<toolkit>cudaMallocManaged</toolkit>
|
||||||
|
<toolkit>cudaDeviceSynchronize</toolkit>
|
||||||
|
<toolkit>cudaFuncSetAttribute</toolkit>
|
||||||
|
<toolkit>cudaEventCreate</toolkit>
|
||||||
|
<toolkit>cudaEventRecord</toolkit>
|
||||||
|
<toolkit>cudaEventSynchronize</toolkit>
|
||||||
|
<toolkit>cudaEventElapsedTime</toolkit>
|
||||||
|
<toolkit>cudaFree</toolkit>
|
||||||
|
</cuda_api_list>
|
||||||
|
<description><![CDATA[CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default.]]></description>
|
||||||
|
<devicecompilation>whole</devicecompilation>
|
||||||
|
<includepaths>
|
||||||
|
<path>./</path>
|
||||||
|
<path>../</path>
|
||||||
|
<path>../../common/inc</path>
|
||||||
|
</includepaths>
|
||||||
|
<keyconcepts>
|
||||||
|
<concept level="basic">Matrix Multiply</concept>
|
||||||
|
<concept level="advanced">WMMA</concept>
|
||||||
|
<concept level="advanced">Tensor Cores</concept>
|
||||||
|
</keyconcepts>
|
||||||
|
<keywords>
|
||||||
|
</keywords>
|
||||||
|
<libraries>
|
||||||
|
</libraries>
|
||||||
|
<librarypaths>
|
||||||
|
</librarypaths>
|
||||||
|
<nsight_eclipse>true</nsight_eclipse>
|
||||||
|
<primary_file>immaTensorCoreGemm.cu</primary_file>
|
||||||
|
<scopes>
|
||||||
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
|
</scopes>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
|
<sm-arch>sm75</sm-arch>
|
||||||
|
<supported_envs>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>aarch64</arch>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<platform>windows7</platform>
|
||||||
|
</env>
|
||||||
|
<env>
|
||||||
|
<arch>ppc64le</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
</supported_envs>
|
||||||
|
<supported_sm_architectures>
|
||||||
|
<from>7.2</from>
|
||||||
|
</supported_sm_architectures>
|
||||||
|
<title>Tensor Core GEMM Integer MMA</title>
|
||||||
|
<type>exe</type>
|
||||||
|
</entry>
|
70
Samples/immaTensorCoreGemm/README.md
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
# immaTensorCoreGemm - Tensor Core GEMM Integer MMA
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default.
|
||||||
|
|
||||||
|
## Key Concepts
|
||||||
|
|
||||||
|
Matrix Multiply, WMMA, Tensor Cores
|
||||||
|
|
||||||
|
## Supported SM Architectures
|
||||||
|
|
||||||
|
[SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
|
## Supported OSes
|
||||||
|
|
||||||
|
Linux, Windows
|
||||||
|
|
||||||
|
## Supported CPU Architecture
|
||||||
|
|
||||||
|
x86_64, ppc64le, aarch64
|
||||||
|
|
||||||
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
|
||||||
|
cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
|
## Build and Run
|
||||||
|
|
||||||
|
### Windows
|
||||||
|
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
|
||||||
|
```
|
||||||
|
*_vs<version>.sln - for Visual Studio <version>
|
||||||
|
```
|
||||||
|
Each individual sample has its own set of solution files in its directory:
|
||||||
|
|
||||||
|
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
|
||||||
|
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
|
||||||
|
|
||||||
|
### Linux
|
||||||
|
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
|
||||||
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
|
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
|
||||||
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||||
|
```
|
||||||
|
$ make SMS="50 60"
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=g++
|
||||||
|
```
|
||||||
|
|
||||||
|
## References (for more details)
|
||||||
|
|
655
Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu
Normal file
|
@ -0,0 +1,655 @@
|
||||||
|
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// CUDA sample demonstrating a integer GEMM computation using the Warp Matrix
|
||||||
|
// Multiply and Accumulate API.
|
||||||
|
|
||||||
|
// In this program, the compute_gemm kernel computes the result of a matrix
|
||||||
|
// multiplication and addition: D = alpha * A * B + beta * C. The dimensions of
|
||||||
|
// both C and D matrices are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x
|
||||||
|
// K_GLOBAL (row-major), the B matrix is K_GLOBAL x N_GLOBAL (column-major). In
|
||||||
|
// that kernel, each CTA computes one 128 x 128 tile of the resulting matrix per
|
||||||
|
// iteration. When the tile is computed, the CTA stores it to the global memory
|
||||||
|
// and begins a new iteration, selecting a new 128 x 128 tile to compute.
|
||||||
|
// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes
|
||||||
|
// eight 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. Warps
|
||||||
|
// compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by
|
||||||
|
// moving through the K_GLOBAL dimension of the A and B matrices and
|
||||||
|
// accumulating the intermediate result in the local thread state.
|
||||||
|
|
||||||
|
// There are a number of simple optimizations used in the algorithm:
|
||||||
|
// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to
|
||||||
|
// shared memory. After that is done, each warp loads the C matrix fragments
|
||||||
|
// from shared memory, thus avoiding a random global memory access.
|
||||||
|
// - On each internal iteration, the CTA copies a portion of the A and B
|
||||||
|
// matrices from
|
||||||
|
// global memory to shared memory. After that, all warps in the CTA reuse the
|
||||||
|
// A and B data from shared memory, thus reducing the number of data copies
|
||||||
|
// from global memory.
|
||||||
|
// - The portions of the A and B matrices are stored in shared memory with an
|
||||||
|
// additional
|
||||||
|
// padding (skew) to reduce the number of shared memory access bank conflicts.
|
||||||
|
// (See a detailed explanation near the SKEW_HALF macro definition.)
|
||||||
|
// - When the CTA finishes computing the tiles of the resulting matrix, each
|
||||||
|
// warp stores
|
||||||
|
// its subtiles to shared memory. The CTA then copies the shared memory
|
||||||
|
// contents to global memory, again avoiding redundant random global memory
|
||||||
|
// accesses.
|
||||||
|
// - Note that the CTA tile size is chosen to maximize the GPU register
|
||||||
|
// utilization,
|
||||||
|
// but carefully enough to avoid local memory use.
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <mma.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
// helper functions and utilities to work with CUDA
|
||||||
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
|
|
||||||
|
// Externally configurable parameters.
|
||||||
|
|
||||||
|
#ifndef CPU_DEBUG
|
||||||
|
// Set this to 1 to verify the correctness of the GPU-computed matrix.
|
||||||
|
#define CPU_DEBUG 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef SHARED_MEMORY_LIMIT_64K
|
||||||
|
// Set this to 0 to use more than 64 Kb of shared memory to cache data, to
|
||||||
|
// improve the performance of the computations on GPU.
|
||||||
|
// Note that you need a GPU that can have more than 64 Kb of shared memory
|
||||||
|
// per multiprocessor.
|
||||||
|
#define SHARED_MEMORY_LIMIT_64K 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// GPU configuration.
|
||||||
|
|
||||||
|
#define WARP_SIZE 32
|
||||||
|
|
||||||
|
// MMA matrix tile dimensions.
|
||||||
|
|
||||||
|
#define M 16
|
||||||
|
#define N 16
|
||||||
|
#define K 16
|
||||||
|
|
||||||
|
#define WMMA_M 16
|
||||||
|
#define WMMA_N 16
|
||||||
|
#define WMMA_K 16
|
||||||
|
|
||||||
|
// GEMM configuration.
|
||||||
|
|
||||||
|
#define M_TILES 256
|
||||||
|
#define N_TILES 256
|
||||||
|
#define K_TILES 256
|
||||||
|
|
||||||
|
#define M_GLOBAL (M * M_TILES)
|
||||||
|
#define N_GLOBAL (N * N_TILES)
|
||||||
|
#define K_GLOBAL (K * K_TILES)
|
||||||
|
|
||||||
|
#define C_LAYOUT wmma::mem_row_major
|
||||||
|
|
||||||
|
// Implementation constants.
|
||||||
|
|
||||||
|
#define WARPS_PER_BLOCK 8
|
||||||
|
#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)
|
||||||
|
|
||||||
|
#if SHARED_MEMORY_LIMIT_64K
|
||||||
|
// With only 64 Kb shared memory available, we can fit two 8-tile chunks of
|
||||||
|
// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each
|
||||||
|
// (i.e. two 8x8 arrays of tiles of 16x16 uint8_t-typed elements per CTA).
|
||||||
|
// But we cannot account the 8 Kb total skew overhead, without which the
|
||||||
|
// performance would be severely impacted. So we choose to reduce the chunk size
|
||||||
|
// in half, i.e. the amount of A and B matrix data we cache in shared memory.
|
||||||
|
// Accordingly, this doubles the number of outer iterations across the global K
|
||||||
|
// dimension, which only slightly impacts the performance.
|
||||||
|
#define CHUNK_K 8
|
||||||
|
#else
|
||||||
|
#define CHUNK_K 16
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(uint8_t))
|
||||||
|
#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4))
|
||||||
|
#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES)
|
||||||
|
#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP)
|
||||||
|
|
||||||
|
// Warp tiling: each CTA is organized as BLOCK_ROW_WARPS x BLOCK_COL_WARPS
// warps, and each warp computes a WARP_COL_TILES x WARP_ROW_TILES grid of
// WMMA tiles of the output matrix.
#define BLOCK_ROW_WARPS 2
#define BLOCK_COL_WARPS 4

#define WARP_ROW_TILES 4
#define WARP_COL_TILES 2

// Total WMMA tiles computed by one CTA along each output dimension.
#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS)
#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS)

// Leading dimension (in elements) of the C/D matrices in global memory.
#define GLOBAL_MEM_STRIDE N_GLOBAL

// Leading dimension (in ints) of the C/D tiles staged in shared memory, and
// the per-warp offset into that staging area.
#define SHMEM_STRIDE (N * BLOCK_ROW_TILES)
#define SHMEM_OFFSET (N * WARP_ROW_TILES)

// The macro below is used to shift rows of the A matrix and columns of the B
// matrix in shared memory to minimize possible bank conflicts. Before
// performing the nvcuda::wmma::mma_sync operation, the warp must load the
// matrix data using the nvcuda::wmma::load_matrix_sync operation. Although the
// memory access pattern is not specified for that function, each lane in the
// warp can read one or multiple matrix elements from different matrix rows or
// columns. For shared memory, such access can result in bank conflicts if
// different rows / columns of the matrix map to the same bank. By shifting each
// row and column by a few bytes, we make sure that they map to different banks,
// thus reducing the number of possible bank conflicts. The number of 16
// one-byte "uint8_t" elements is chosen as the minimum possible shift because
// we must keep each row and column 128-bit aligned, as required by
// nvcuda::wmma::load_matrix_sync.
#define SKEW_UINT8 16

// Executes `expr` (typically a kernel launch), then checks cudaGetLastError()
// to catch launch-configuration errors; aborts with file line and the failing
// expression text on any error.
#define checkKernelErrors(expr)                             \
  do {                                                      \
    expr;                                                   \
                                                            \
    cudaError_t __err = cudaGetLastError();                 \
    if (__err != cudaSuccess) {                             \
      printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, \
             cudaGetErrorString(__err));                    \
      abort();                                              \
    }                                                       \
  } while (0)

using namespace nvcuda;
// Fill the host-side input matrices with small random values.
// a: M_GLOBAL x K_GLOBAL (uint8_t), b: N_GLOBAL x K_GLOBAL (uint8_t),
// c: M_GLOBAL x N_GLOBAL (int). All entries are drawn from rand() % 3,
// i.e. the range [0, 2], in the same order as a row-major nested traversal.
__host__ void init_host_matrices(uint8_t *a, uint8_t *b, int *c) {
  // A matrix: one linear pass covers the same elements in the same order as
  // the equivalent row-by-row loop.
  for (int idx = 0; idx < M_GLOBAL * K_GLOBAL; idx++) {
    a[idx] = (uint8_t)(rand() % 3);
  }

  // B matrix.
  for (int idx = 0; idx < N_GLOBAL * K_GLOBAL; idx++) {
    b[idx] = (uint8_t)(rand() % 3);
  }

  // C matrix (integer accumulator input).
  for (int idx = 0; idx < M_GLOBAL * N_GLOBAL; idx++) {
    c[idx] = rand() % 3;
  }
}
// High-performance integer (uint8) tensor-core GEMM: D = alpha*A*B + beta*C.
// Persistent-CTA design: each CTA loops over 128x128 output tiles until none
// remain. Expects dynamic shared memory sized to hold both the staged A/B
// chunks and the per-CTA C/D tile (see SHMEM_SZ computed in main), and a
// launch with THREADS_PER_BLOCK threads per CTA.
__global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B,
                                  const int *C, int *D, int alpha, int beta) {
  // Dynamic shared memory viewed as rows of (CHUNK_K * K + SKEW_UINT8) bytes;
  // the SKEW_UINT8 padding staggers rows across banks.
  extern __shared__ uint8_t shmem[][CHUNK_K * K + SKEW_UINT8];

  // Warp and lane identification.
  const unsigned int warpId = threadIdx.x / WARP_SIZE;
  const unsigned int laneId = threadIdx.x % WARP_SIZE;

  // Offset in shared memory from which the B matrix is stored.
  const size_t shmem_idx_b_off = BLOCK_COL_TILES * M;

  // This pointer is used to access the C and D matrix tiles this warp computes.
  int *shmem_warp_tile_ptr = (int *)&shmem[0][0] +
                             (warpId / 2) * SHMEM_STRIDE * K * 2 +
                             (warpId % 2) * SHMEM_OFFSET;

  // This pointer is used to stream the C and D matrices block-wide tile to and
  // from shared memory.
  int *shmem_warp_stream_ptr = (int *)&shmem[0][0] + warpId * SHMEM_STRIDE * K;

  // Adjust the beta scaler, as it'll be multiplied by alpha at the end of
  // each tile computation. Technically this is not generally correct (may
  // result in a loss of precision). Zero still needs to be specially handled
  // though.
  beta /= alpha;

  // Each CTA slides along the 128 x 128 tiles from the top left corner of the
  // matrix to the right and down, and selects the next tile to compute. Once
  // there's no such tile, all warps in this CTA exit.
  for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
    const unsigned int block_tile_i =
        ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
    const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;

    // Stop when there are no more D matrix tiles to compute in this CTA.
    if (block_tile_i >= M_TILES) {
      break;
    }

    // This warp's pointer to the C matrix data to copy memory from to shared
    // memory.
    const size_t gmem_idx =
        (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N;
    const int *src_gmem_warp_stream_ptr = &C[gmem_idx];

    // Stream multiple C tiles to shared memory (one 16-byte int4 per lane per
    // row).
#pragma unroll
    for (int i = 0; i < K; i++) {
      typedef int4 copy_t;

      *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) =
          *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) +
            laneId);
    }

    __syncthreads();

    // These fragments will accumulate the result of A and B matrix fragment
    // multiplications along the K_GLOBAL dimension.
    wmma::fragment<wmma::accumulator, M, N, K, int> c[WARP_COL_TILES]
                                                     [WARP_ROW_TILES];

    // Load the C matrix tiles into fragments from shared memory.
#pragma unroll
    for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
      for (int j = 0; j < WARP_ROW_TILES; j++) {
        const int *tile_ptr =
            shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;

        wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT);
      }
    }

    __syncthreads();

    // Scale the C matrix (by the adjusted beta = original beta / alpha).
#pragma unroll
    for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
      for (int j = 0; j < WARP_ROW_TILES; j++) {
#pragma unroll
        for (int t = 0; t < c[i][j].num_elements; t++) {
          c[i][j].x[t] *= beta;
        }
      }
    }

    // Select what warp copies what matrix to shared memory.
    // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix.
    const uint8_t *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] +
                                              M * K_GLOBAL * (warpId % 4) * 2)
                                           : (&B[block_tile_j * N * K_GLOBAL] +
                                              N * K_GLOBAL * (warpId % 4) * 2);

    // Go through the global K dimension by a fixed step at a time.
#pragma unroll
    for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) {
      // Copy slices of the A and B matrices to shared memory.
      // The first half of the warps in the CTA copy the A matrix, the rest copy
      // the B matrix.
      size_t shmem_idx =
          warpId < (WARPS_PER_BLOCK / 2)
              ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2)
              : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off);

      // First half of the warp copies the first row / column of the matrix,
      // the second half of the warp copies the next.
      int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K +
                                (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) +
                       (laneId % CHUNK_COPY_LINE_LANES);

      // Shift the second half of the warp to the next row / column in the
      // shared memory.
      shmem_idx += laneId / CHUNK_COPY_LINE_LANES;

#pragma unroll
      for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2;
           i++) {
        // Copy 16 bytes at once in each lane.
        *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) =
            *lane_ptr;

        // Advance the global memory pointer and the shared memory index.
        lane_ptr = (int4 *)((uint8_t *)lane_ptr +
                            K_GLOBAL * CHUNK_COPY_LINES_PER_WARP);
        shmem_idx += CHUNK_COPY_LINES_PER_WARP;
      }

      __syncthreads();

      // Compute a grid of C matrix tiles in each warp.
#pragma unroll
      for (int k_step = 0; k_step < CHUNK_K; k_step++) {
        wmma::fragment<wmma::matrix_a, M, N, K, uint8_t, wmma::row_major>
            a[WARP_COL_TILES];
        wmma::fragment<wmma::matrix_b, M, N, K, uint8_t, wmma::col_major>
            b[WARP_ROW_TILES];

#pragma unroll
        for (int i = 0; i < WARP_COL_TILES; i++) {
          size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M);
          const uint8_t *tile_ptr = &shmem[shmem_idx_a][k_step * K];

          wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_UINT8);

#pragma unroll
          for (int j = 0; j < WARP_ROW_TILES; j++) {
            if (i == 0) {
              // Load the B matrix fragment once, because it is going to be
              // reused against the other A matrix fragments.
              size_t shmem_idx_b = shmem_idx_b_off +
                                   (WARP_ROW_TILES * N) * (warpId % 2) +
                                   (j * N);
              const uint8_t *tile_ptr = &shmem[shmem_idx_b][k_step * K];

              wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_UINT8);
            }

            wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]);
          }
        }
      }

      __syncthreads();
    }

    // Store the D fragments to shared memory.
#pragma unroll
    for (int i = 0; i < WARP_COL_TILES; i++) {
#pragma unroll
      for (int j = 0; j < WARP_ROW_TILES; j++) {
#pragma unroll
        // Uniform, point-wise transformations of ALL fragment elements by ALL
        // threads in the warp are well-defined even though element indices
        // within fragment storage are not defined.
        for (int t = 0; t < c[i][j].num_elements; t++) c[i][j].x[t] *= alpha;

        int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;

        wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT);
      }
    }

    __syncthreads();

    // Now that shared memory contains all the D tiles, stream them to global
    // memory.
    int *dst_gmem_warp_stream_ptr = &D[gmem_idx];

#pragma unroll
    for (int i = 0; i < K; i++) {
      *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
          *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
    }

    __syncthreads();
  }
}
// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
//  1) Matrices are packed in memory.
//  2) M, N and K are multiples of 16.
//  3) Neither A nor B are transposed.
// Note: This is a less performant version of the compute_gemm_imma kernel. It
// is designed for
//       demonstration purposes only to show the CUDA WMMA API use without
//       relying on availability of the shared memory.
// Each warp computes one WMMA_M x WMMA_N output tile; the 2D grid/block maps
// warps over output tiles. Requires SM72+ (uint8 tensor-core WMMA).
__global__ void simple_wmma_gemm_imma(const uint8_t *a, const uint8_t *b,
                                      const int *c, int *d, int m_ld, int n_ld,
                                      int k_ld, int alpha, int beta) {
  // Leading dimensions. Packed with no transpositions.
  // NOTE(review): a_frag is loaded row_major with row stride `lda`, so for a
  // packed M x K row-major A the stride should be k_ld. Setting lda = m_ld is
  // only correct when m_ld == k_ld, as with this sample's square defaults —
  // TODO confirm before reusing with non-square shapes.
  int lda = m_ld;
  int ldb = k_ld;
  int ldc = n_ld;

  // Tile using a 2D grid: warpM indexes output-tile rows, warpN columns.
  int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
  int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

  // Declare the fragments
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, uint8_t,
                 wmma::row_major>
      a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, uint8_t,
                 wmma::col_major>
      b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> acc_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c_frag;

  // Zero the integer accumulator. Use an int literal: the fragment's element
  // type is int, so the previous 0.0f only worked via implicit conversion.
  wmma::fill_fragment(acc_frag, 0);

  // Loop over k
  for (int i = 0; i < k_ld; i += WMMA_K) {
    int aCol = i;
    int aRow = warpM * WMMA_M;

    int bCol = i;
    int bRow = warpN * WMMA_N;

    // Bounds checking
    if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
      // Load the inputs
      wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
      wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb);

      // Perform the matrix multiplication
      wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }
  }

  // Load in the current value of c, scale it by beta, and add this our result
  // scaled by alpha
  int cCol = warpN * WMMA_N;
  int cRow = warpM * WMMA_M;

  if (cRow < m_ld && cCol < n_ld) {
    wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc,
                           wmma::mem_row_major);

    for (int i = 0; i < c_frag.num_elements; i++) {
      c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
    }

    // Store the output
    wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc,
                            wmma::mem_row_major);
  }
}
// Host reference GEMM: C = alpha * (A * B) + beta * C, all-integer math.
// A is numARows x numAColumns, row-major. B is stored so element (k, j)
// lives at B[j * numBRows + k] (column of B contiguous, matching the
// device-side col_major layout). C is numCRows x numCColumns, row-major.
__host__ void matMultiplyOnHost(uint8_t *A, uint8_t *B, int *C, int alpha,
                                int beta, int numARows, int numAColumns,
                                int numBRows, int numBColumns, int numCRows,
                                int numCColumns) {
  for (int row = 0; row < numCRows; ++row) {
    for (int col = 0; col < numCColumns; ++col) {
      // Dot product of A's row with B's column, accumulated in int.
      int acc = 0;
      for (int k = 0; k < numAColumns; ++k) {
        acc += A[row * numAColumns + k] * B[col * numBRows + k];
      }

      // Blend the product into the existing C entry.
      C[row * numCColumns + col] =
          alpha * acc + beta * C[row * numCColumns + col];
    }
  }
}
// Driver: allocates and initializes host/device matrices, picks the
// shared-memory compute_gemm_imma kernel when the GPU has enough shared
// memory per SM (falling back to the simple WMMA kernel otherwise), times the
// run with CUDA events, optionally verifies against a CPU reference
// (CPU_DEBUG), and reports elapsed time and TOPS.
int main(int argc, char **argv) {
  printf("Initializing...\n");

  int dev = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

  // Tensor cores require a GPU of Volta (SM72) architecture or higher.
  // (major < 7 already excludes everything below Volta; the second clause
  // waives SM 7.0/7.1, which lack uint8 WMMA support.)
  if (deviceProp.major < 7 || (deviceProp.major <= 7 && deviceProp.minor < 2)) {
    printf(
        "immaTensorCoreGemm requires SM 7.2 or higher to use Tensor Cores. "
        "Exiting...\n");
    exit(EXIT_WAIVED);
  }

  printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES);
  printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
  printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);

  // Host-side copies of the input matrices.
  uint8_t *A_h = NULL;
  uint8_t *B_h = NULL;
  int *C_h = NULL;
#if CPU_DEBUG
  // result_hD: device result copied back; result_host: CPU reference.
  int *result_hD = NULL;
  int *result_host = NULL;
#endif

  A_h = (uint8_t *)malloc(sizeof(uint8_t) * M_GLOBAL * K_GLOBAL);
  B_h = (uint8_t *)malloc(sizeof(uint8_t) * K_GLOBAL * N_GLOBAL);
  C_h = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
#if CPU_DEBUG
  result_hD = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
  result_host = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL);
#endif

  // Device buffers. D receives the GEMM output.
  uint8_t *A = NULL;
  uint8_t *B = NULL;
  int *C = NULL;
  int *D = NULL;

  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&A), sizeof(uint8_t) * M_GLOBAL * K_GLOBAL));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&B), sizeof(uint8_t) * N_GLOBAL * K_GLOBAL));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&C), sizeof(int) * M_GLOBAL * N_GLOBAL));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&D), sizeof(int) * M_GLOBAL * N_GLOBAL));

  // The kernels use 128-bit (int4) vectorized accesses; verify alignment.
  assert(((unsigned long long)A) % 128 == 0);
  assert(((unsigned long long)B) % 128 == 0);
  assert(((unsigned long long)C) % 128 == 0);
  assert(((unsigned long long)D) % 128 == 0);

  init_host_matrices(A_h, B_h, C_h);

  checkCudaErrors(cudaMemcpy(A, A_h, sizeof(uint8_t) * M_GLOBAL * K_GLOBAL,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(B, B_h, sizeof(uint8_t) * N_GLOBAL * K_GLOBAL,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(C, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemset(D, 0, sizeof(int) * M_GLOBAL * N_GLOBAL));

  printf("Preparing data for GPU...\n");

  assert(((unsigned long long)A) % 128 == 0);
  assert(((unsigned long long)B) % 128 == 0);
  assert(((unsigned long long)C) % 128 == 0);
  assert(((unsigned long long)D) % 128 == 0);

  enum {
    // Compute the right amount of shared memory to request.
    // We need shared memory to hold per-CTA C and D matrix tiles, and to cache
    // per-CTA chunks
    // of the A and B matrices. Therefore, the right amount to request is the
    // maximum of those
    // two numbers.
    SHMEM_SZ = MAX(sizeof(uint8_t) * (BLOCK_COL_TILES * M) *
                       (CHUNK_K * K + SKEW_UINT8) * 2,
                   M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N *
                       (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(int))
  };

  printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);

  int alpha = 1;
  int beta = 1;

  cudaEvent_t start, stop;

  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  checkCudaErrors(cudaEventRecord(start));

  // If enough shared memory available on the GPU use high performant kernel
  if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
    printf("Computing... using high performance kernel compute_gemm_imma \n");

    // Opt in to >48 KB dynamic shared memory per block where needed.
    checkCudaErrors(cudaFuncSetAttribute(
        compute_gemm_imma, cudaFuncAttributeMaxDynamicSharedMemorySize,
        SHMEM_SZ));
    checkKernelErrors(
        (compute_gemm_imma<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK,
                             SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
#if CPU_DEBUG
    checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL,
                               cudaMemcpyDeviceToHost));
#endif
  } else {
    dim3 gridDim;
    dim3 blockDim;

    // blockDim.x must be a multiple of warpSize
    // 128x4 means we have 16 warps and a block computes a 64x64 output tile
    blockDim.x = 128;
    blockDim.y = 4;

    // Ceil-divide the output into warp-sized WMMA tiles.
    gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) /
                (WMMA_M * blockDim.x / 32);
    gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);

    printf("Computing... using simple_wmma_gemm_imma kernel\n");
    simple_wmma_gemm_imma<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL,
                                                 K_GLOBAL, alpha, beta);
#if CPU_DEBUG
    checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL,
                               cudaMemcpyDeviceToHost));
#endif
  }

  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));

#if CPU_DEBUG
  printf("Verifying correctness of the computations...\n");

  // Seed the CPU reference with the original C, then run the host GEMM.
  memcpy(result_host, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL);

  matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL,
                    K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL);

  // Integer GEMM is exact, so any nonzero difference is a mismatch.
  for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
    if (abs(result_hD[i] - result_host[i]) > 0) {
      printf("mismatch i=%d result_hD=%d result_host=%d\n", i, result_hD[i],
             result_host[i]);
    }
  }
  free(result_host);
  free(result_hD);
#endif

  float milliseconds = 0;

  checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));

  printf("Time: %f ms\n", milliseconds);
  // 2*M*N*K integer ops, scaled to tera-ops per second.
  printf("TOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12);

  // NOTE(review): `start`/`stop` events are never destroyed
  // (cudaEventDestroy); harmless at process exit but worth tidying.
  free(A_h);
  free(B_h);
  free(C_h);
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(A)));
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(B)));
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(C)));
  checkCudaErrors(cudaFree(reinterpret_cast<void *>(D)));

  return EXIT_SUCCESS;
}
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2012
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2012</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v110</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 13.00
|
||||||
|
# Visual Studio 2013
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2013</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v120</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 14.00
|
||||||
|
# Visual Studio 2015
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
107
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2015</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v140</PlatformToolset>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
20
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.sln
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio 2017
|
||||||
|
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
|
||||||
|
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
108
Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup Label="ProjectConfigurations">
|
||||||
|
<ProjectConfiguration Include="Debug|x64">
|
||||||
|
<Configuration>Debug</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
<ProjectConfiguration Include="Release|x64">
|
||||||
|
<Configuration>Release</Configuration>
|
||||||
|
<Platform>x64</Platform>
|
||||||
|
</ProjectConfiguration>
|
||||||
|
</ItemGroup>
|
||||||
|
<PropertyGroup Label="Globals">
|
||||||
|
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
|
||||||
|
<RootNamespace>immaTensorCoreGemm_vs2017</RootNamespace>
|
||||||
|
<ProjectName>immaTensorCoreGemm</ProjectName>
|
||||||
|
<CudaToolkitCustomDir />
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<ConfigurationType>Application</ConfigurationType>
|
||||||
|
<CharacterSet>MultiByte</CharacterSet>
|
||||||
|
<PlatformToolset>v141</PlatformToolset>
|
||||||
|
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<UseDebugLibraries>true</UseDebugLibraries>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||||
|
</PropertyGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<ImportGroup Label="PropertySheets">
|
||||||
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
</ImportGroup>
|
||||||
|
<PropertyGroup Label="UserMacros" />
|
||||||
|
<PropertyGroup>
|
||||||
|
<IntDir>$(Platform)/$(Configuration)/</IntDir>
|
||||||
|
<IncludePath>$(IncludePath)</IncludePath>
|
||||||
|
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
|
||||||
|
<CodeAnalysisRules />
|
||||||
|
<CodeAnalysisRuleAssemblies />
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition="'$(Platform)'=='x64'">
|
||||||
|
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemDefinitionGroup>
|
||||||
|
<ClCompile>
|
||||||
|
<WarningLevel>Level3</WarningLevel>
|
||||||
|
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||||
|
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<SubSystem>Console</SubSystem>
|
||||||
|
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||||
|
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
|
||||||
|
<OutputFile>$(OutDir)/immaTensorCoreGemm.exe</OutputFile>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<CodeGeneration>compute_75,sm_75;</CodeGeneration>
|
||||||
|
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<Include>./;../../Common</Include>
|
||||||
|
<Defines>WIN32</Defines>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>Disabled</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MTd</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||||
|
<ClCompile>
|
||||||
|
<Optimization>MaxSpeed</Optimization>
|
||||||
|
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||||
|
</ClCompile>
|
||||||
|
<Link>
|
||||||
|
<GenerateDebugInformation>false</GenerateDebugInformation>
|
||||||
|
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
|
||||||
|
</Link>
|
||||||
|
<CudaCompile>
|
||||||
|
<Runtime>MT</Runtime>
|
||||||
|
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||||
|
</CudaCompile>
|
||||||
|
</ItemDefinitionGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<CudaCompile Include="immaTensorCoreGemm.cu" />
|
||||||
|
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
|
</ImportGroup>
|
||||||
|
</Project>
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -46,6 +46,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Runtime API, Linear Algebra
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -102,6 +102,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -103,6 +103,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -10,7 +10,7 @@ CUDA Driver API, Matrix Multiply
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -106,6 +106,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="PropertySheets">
|
<ImportGroup Label="PropertySheets">
|
||||||
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
|
||||||
|
@ -107,6 +107,6 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
<Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
|
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
301
Samples/nvJPEG/Makefile
Normal file
|
@ -0,0 +1,301 @@
|
||||||
|
################################################################################
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met:
|
||||||
|
# * Redistributions of source code must retain the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright
|
||||||
|
# notice, this list of conditions and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
#
|
||||||
|
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||||
|
#
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Location of the CUDA Toolkit
|
||||||
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# start deprecated interface #
|
||||||
|
##############################
|
||||||
|
ifeq ($(x86_64),1)
|
||||||
|
$(info WARNING - x86_64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||||
|
TARGET_ARCH ?= x86_64
|
||||||
|
endif
|
||||||
|
ifeq ($(ARMv7),1)
|
||||||
|
$(info WARNING - ARMv7 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||||
|
TARGET_ARCH ?= armv7l
|
||||||
|
endif
|
||||||
|
ifeq ($(aarch64),1)
|
||||||
|
$(info WARNING - aarch64 variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||||
|
TARGET_ARCH ?= aarch64
|
||||||
|
endif
|
||||||
|
ifeq ($(ppc64le),1)
|
||||||
|
$(info WARNING - ppc64le variable has been deprecated)
|
||||||
|
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||||
|
TARGET_ARCH ?= ppc64le
|
||||||
|
endif
|
||||||
|
ifneq ($(GCC),)
|
||||||
|
$(info WARNING - GCC variable has been deprecated)
|
||||||
|
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||||
|
HOST_COMPILER ?= $(GCC)
|
||||||
|
endif
|
||||||
|
ifneq ($(abi),)
|
||||||
|
$(error ERROR - abi variable has been removed)
|
||||||
|
endif
|
||||||
|
############################
|
||||||
|
# end deprecated interface #
|
||||||
|
############################
|
||||||
|
|
||||||
|
# architecture
|
||||||
|
HOST_ARCH := $(shell uname -m)
|
||||||
|
TARGET_ARCH ?= $(HOST_ARCH)
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
|
||||||
|
TARGET_SIZE := 64
|
||||||
|
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||||
|
TARGET_SIZE := 32
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||||
|
endif
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
|
||||||
|
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||||
|
TARGET_ARCH = armv7l
|
||||||
|
endif
|
||||||
|
|
||||||
|
# operating system
|
||||||
|
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||||
|
TARGET_OS ?= $(HOST_OS)
|
||||||
|
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||||
|
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# host compiler
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||||
|
HOST_COMPILER ?= clang++
|
||||||
|
endif
|
||||||
|
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||||
|
ifeq ($(TARGET_OS),linux)
|
||||||
|
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
ifeq ($(TARGET_OS), linux)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||||
|
else ifeq ($(TARGET_OS),qnx)
|
||||||
|
ifeq ($(QNX_HOST),)
|
||||||
|
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||||
|
endif
|
||||||
|
ifeq ($(QNX_TARGET),)
|
||||||
|
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||||
|
endif
|
||||||
|
export QNX_HOST
|
||||||
|
export QNX_TARGET
|
||||||
|
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
|
||||||
|
else ifeq ($(TARGET_OS), android)
|
||||||
|
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||||
|
endif
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
HOST_COMPILER ?= g++
|
||||||
|
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||||
|
|
||||||
|
# internal flags
|
||||||
|
NVCCFLAGS := -m${TARGET_SIZE}
|
||||||
|
CCFLAGS :=
|
||||||
|
LDFLAGS :=
|
||||||
|
|
||||||
|
# build flags
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||||
|
CCFLAGS += -arch $(HOST_ARCH)
|
||||||
|
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||||
|
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||||
|
CCFLAGS += -mfloat-abi=hard
|
||||||
|
else ifeq ($(TARGET_OS),android)
|
||||||
|
LDFLAGS += -pie
|
||||||
|
CCFLAGS += -fpie -fpic -fexceptions
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
ifneq ($(TARGET_FS),)
|
||||||
|
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||||
|
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||||
|
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
endif
|
||||||
|
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
|
||||||
|
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||||
|
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
|
||||||
|
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET_OS),qnx)
|
||||||
|
CCFLAGS += -DWIN_INTERFACE_CUSTOM
|
||||||
|
LDFLAGS += -lsocket
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Install directory of different arch
|
||||||
|
CUDA_INSTALL_TARGET_DIR :=
|
||||||
|
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||||
|
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||||
|
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||||
|
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debug build flags
|
||||||
|
ifeq ($(dbg),1)
|
||||||
|
NVCCFLAGS += -g -G
|
||||||
|
BUILD_TYPE := debug
|
||||||
|
else
|
||||||
|
BUILD_TYPE := release
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_CCFLAGS :=
|
||||||
|
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||||
|
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||||
|
|
||||||
|
SAMPLE_ENABLED := 1
|
||||||
|
|
||||||
|
# This sample is not supported on Mac OSX
|
||||||
|
ifeq ($(TARGET_OS),darwin)
|
||||||
|
$(info >>> WARNING - nvJPEG is not supported on Mac OSX - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
# This sample is not supported on ARMv7
|
||||||
|
ifeq ($(TARGET_ARCH),armv7l)
|
||||||
|
$(info >>> WARNING - nvJPEG is not supported on ARMv7 - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
# This sample is not supported on aarch64
|
||||||
|
ifeq ($(TARGET_ARCH),aarch64)
|
||||||
|
$(info >>> WARNING - nvJPEG is not supported on aarch64 - waiving sample <<<)
|
||||||
|
SAMPLE_ENABLED := 0
|
||||||
|
endif
|
||||||
|
|
||||||
|
ALL_LDFLAGS :=
|
||||||
|
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||||
|
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||||
|
|
||||||
|
# Common includes and paths for CUDA
|
||||||
|
INCLUDES := -I../../Common
|
||||||
|
LIBRARIES :=
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
LIBRARIES += -lnvjpeg
|
||||||
|
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
EXEC ?= @echo "[@]"
|
||||||
|
endif
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Target rules
|
||||||
|
all: build
|
||||||
|
|
||||||
|
build: nvJPEG
|
||||||
|
|
||||||
|
check.deps:
|
||||||
|
ifeq ($(SAMPLE_ENABLED),0)
|
||||||
|
@echo "Sample will be waived due to the above missing dependencies"
|
||||||
|
else
|
||||||
|
@echo "Sample is ready - all dependencies have been met"
|
||||||
|
endif
|
||||||
|
|
||||||
|
nvJPEG.o:nvJPEG.cpp
|
||||||
|
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||||
|
|
||||||
|
nvJPEG: nvJPEG.o
|
||||||
|
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||||
|
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||||
|
|
||||||
|
run: build
|
||||||
|
$(EXEC) ./nvJPEG
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f nvJPEG nvJPEG.o
|
||||||
|
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/nvJPEG
|
||||||
|
|
||||||
|
clobber: clean
|
58
Samples/nvJPEG/NsightEclipse.xml
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||||
|
<entry>
|
||||||
|
<name>nvJPEG</name>
|
||||||
|
<description><![CDATA[A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library.]]></description>
|
||||||
|
<devicecompilation>whole</devicecompilation>
|
||||||
|
<includepaths>
|
||||||
|
<path>./</path>
|
||||||
|
<path>../</path>
|
||||||
|
<path>../../common/inc</path>
|
||||||
|
</includepaths>
|
||||||
|
<keyconcepts>
|
||||||
|
<concept level="basic">Image Decoding</concept>
|
||||||
|
<concept level="basic">NVJPEG Library</concept>
|
||||||
|
</keyconcepts>
|
||||||
|
<keywords>
|
||||||
|
<keyword>NVJPEG</keyword>
|
||||||
|
<keyword>JPEG Decoding</keyword>
|
||||||
|
</keywords>
|
||||||
|
<libraries>
|
||||||
|
<library>nvjpeg</library>
|
||||||
|
</libraries>
|
||||||
|
<librarypaths>
|
||||||
|
</librarypaths>
|
||||||
|
<nsight_eclipse>true</nsight_eclipse>
|
||||||
|
<primary_file>nvJPEG.cpp</primary_file>
|
||||||
|
<qatests>
|
||||||
|
<qatest>-i ../../../../Samples/nvJPEG/images/</qatest>
|
||||||
|
</qatests>
|
||||||
|
<required_dependencies>
|
||||||
|
<dependency>NVJPEG</dependency>
|
||||||
|
</required_dependencies>
|
||||||
|
<scopes>
|
||||||
|
<scope>1:CUDA Basic Topics</scope>
|
||||||
|
<scope>3:JPEG Decoding</scope>
|
||||||
|
</scopes>
|
||||||
|
<sm-arch>sm30</sm-arch>
|
||||||
|
<sm-arch>sm35</sm-arch>
|
||||||
|
<sm-arch>sm37</sm-arch>
|
||||||
|
<sm-arch>sm50</sm-arch>
|
||||||
|
<sm-arch>sm52</sm-arch>
|
||||||
|
<sm-arch>sm60</sm-arch>
|
||||||
|
<sm-arch>sm61</sm-arch>
|
||||||
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
|
<sm-arch>sm75</sm-arch>
|
||||||
|
<supported_envs>
|
||||||
|
<env>
|
||||||
|
<arch>x86_64</arch>
|
||||||
|
<platform>linux</platform>
|
||||||
|
</env>
|
||||||
|
</supported_envs>
|
||||||
|
<supported_sm_architectures>
|
||||||
|
<from>3.0</from>
|
||||||
|
</supported_sm_architectures>
|
||||||
|
<title>NVJPEG simple</title>
|
||||||
|
<type>exe</type>
|
||||||
|
</entry>
|
61
Samples/nvJPEG/README.md
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# nvJPEG - NVJPEG simple
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library.
|
||||||
|
|
||||||
|
## Key Concepts
|
||||||
|
|
||||||
|
Image Decoding, NVJPEG Library
|
||||||
|
|
||||||
|
## Supported SM Architectures
|
||||||
|
|
||||||
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
|
## Supported OSes
|
||||||
|
|
||||||
|
Linux
|
||||||
|
|
||||||
|
## Supported CPU Architecture
|
||||||
|
|
||||||
|
x86_64
|
||||||
|
|
||||||
|
## CUDA APIs involved
|
||||||
|
|
||||||
|
## Dependencies needed to build/run
|
||||||
|
[NVJPEG](../../README.md#nvjpeg)
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||||
|
|
||||||
|
## Build and Run
|
||||||
|
|
||||||
|
### Linux
|
||||||
|
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||||
|
```
|
||||||
|
$ cd <sample_dir>
|
||||||
|
$ make
|
||||||
|
```
|
||||||
|
The samples makefiles can take advantage of certain options:
|
||||||
|
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64.
|
||||||
|
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||||
|
`$ make TARGET_ARCH=x86_64` <br/>
|
||||||
|
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||||
|
* **dbg=1** - build with debug symbols
|
||||||
|
```
|
||||||
|
$ make dbg=1
|
||||||
|
```
|
||||||
|
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||||
|
```
|
||||||
|
$ make SMS="50 60"
|
||||||
|
```
|
||||||
|
|
||||||
|
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||||
|
```
|
||||||
|
$ make HOST_COMPILER=g++
|
||||||
|
```
|
||||||
|
|
||||||
|
## References (for more details)
|
||||||
|
|
BIN
Samples/nvJPEG/images/img1.jpg
Normal file
After Width: | Height: | Size: 66 KiB |
BIN
Samples/nvJPEG/images/img2.jpg
Normal file
After Width: | Height: | Size: 50 KiB |
BIN
Samples/nvJPEG/images/img3.jpg
Normal file
After Width: | Height: | Size: 34 KiB |
BIN
Samples/nvJPEG/images/img4.jpg
Normal file
After Width: | Height: | Size: 30 KiB |
BIN
Samples/nvJPEG/images/img5.jpg
Normal file
After Width: | Height: | Size: 80 KiB |
BIN
Samples/nvJPEG/images/img6.jpg
Normal file
After Width: | Height: | Size: 63 KiB |
BIN
Samples/nvJPEG/images/img7.jpg
Normal file
After Width: | Height: | Size: 92 KiB |
BIN
Samples/nvJPEG/images/img8.jpg
Normal file
After Width: | Height: | Size: 52 KiB |
559
Samples/nvJPEG/nvJPEG.cpp
Normal file
|
@ -0,0 +1,559 @@
|
||||||
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This sample needs at least CUDA 10.0. It demonstrates usages of the nvJPEG
|
||||||
|
// library nvJPEG supports single and multiple image(batched) decode. Multiple
|
||||||
|
// images can be decoded using the API for batch mode
|
||||||
|
|
||||||
|
#include <cuda_runtime_api.h>
|
||||||
|
#include "nvJPEG_helper.hxx"
|
||||||
|
|
||||||
|
int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); }
|
||||||
|
|
||||||
|
int dev_free(void *p) { return (int)cudaFree(p); }
|
||||||
|
|
||||||
|
typedef std::vector<std::string> FileNames;
|
||||||
|
typedef std::vector<std::vector<char> > FileData;
|
||||||
|
|
||||||
|
struct decode_params_t {
|
||||||
|
std::string input_dir;
|
||||||
|
int batch_size;
|
||||||
|
int total_images;
|
||||||
|
int dev;
|
||||||
|
int warmup;
|
||||||
|
|
||||||
|
nvjpegJpegState_t nvjpeg_state;
|
||||||
|
nvjpegHandle_t nvjpeg_handle;
|
||||||
|
cudaStream_t stream;
|
||||||
|
|
||||||
|
nvjpegOutputFormat_t fmt;
|
||||||
|
bool write_decoded;
|
||||||
|
std::string output_dir;
|
||||||
|
|
||||||
|
bool pipelined;
|
||||||
|
bool batched;
|
||||||
|
};
|
||||||
|
|
||||||
|
int read_next_batch(FileNames &image_names, int batch_size,
|
||||||
|
FileNames::iterator &cur_iter, FileData &raw_data,
|
||||||
|
std::vector<size_t> &raw_len, FileNames ¤t_names) {
|
||||||
|
int counter = 0;
|
||||||
|
|
||||||
|
while (counter < batch_size) {
|
||||||
|
if (cur_iter == image_names.end()) {
|
||||||
|
std::cerr << "Image list is too short to fill the batch, adding files "
|
||||||
|
"from the beginning of the image list"
|
||||||
|
<< std::endl;
|
||||||
|
cur_iter = image_names.begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (image_names.size() == 0) {
|
||||||
|
std::cerr << "No valid images left in the input list, exit" << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read an image from disk.
|
||||||
|
std::ifstream input(cur_iter->c_str(),
|
||||||
|
std::ios::in | std::ios::binary | std::ios::ate);
|
||||||
|
if (!(input.is_open())) {
|
||||||
|
std::cerr << "Cannot open image: " << *cur_iter
|
||||||
|
<< ", removing it from image list" << std::endl;
|
||||||
|
image_names.erase(cur_iter);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the size
|
||||||
|
std::streamsize file_size = input.tellg();
|
||||||
|
input.seekg(0, std::ios::beg);
|
||||||
|
// resize if buffer is too small
|
||||||
|
if (raw_data[counter].size() < file_size) {
|
||||||
|
raw_data[counter].resize(file_size);
|
||||||
|
}
|
||||||
|
if (!input.read(raw_data[counter].data(), file_size)) {
|
||||||
|
std::cerr << "Cannot read from file: " << *cur_iter
|
||||||
|
<< ", removing it from image list" << std::endl;
|
||||||
|
image_names.erase(cur_iter);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
raw_len[counter] = file_size;
|
||||||
|
|
||||||
|
current_names[counter] = *cur_iter;
|
||||||
|
|
||||||
|
counter++;
|
||||||
|
cur_iter++;
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare buffers for RGBi output format
|
||||||
|
int prepare_buffers(FileData &file_data, std::vector<size_t> &file_len,
|
||||||
|
std::vector<int> &img_width, std::vector<int> &img_height,
|
||||||
|
std::vector<nvjpegImage_t> &ibuf,
|
||||||
|
std::vector<nvjpegImage_t> &isz, FileNames ¤t_names,
|
||||||
|
decode_params_t ¶ms) {
|
||||||
|
int widths[NVJPEG_MAX_COMPONENT];
|
||||||
|
int heights[NVJPEG_MAX_COMPONENT];
|
||||||
|
int channels;
|
||||||
|
nvjpegChromaSubsampling_t subsampling;
|
||||||
|
|
||||||
|
for (int i = 0; i < file_data.size(); i++) {
|
||||||
|
checkCudaErrors(nvjpegGetImageInfo(
|
||||||
|
params.nvjpeg_handle, (unsigned char *)file_data[i].data(), file_len[i],
|
||||||
|
&channels, &subsampling, widths, heights));
|
||||||
|
|
||||||
|
img_width[i] = widths[0];
|
||||||
|
img_height[i] = heights[0];
|
||||||
|
|
||||||
|
std::cout << "Processing: " << current_names[i] << std::endl;
|
||||||
|
std::cout << "Image is " << channels << " channels." << std::endl;
|
||||||
|
for (int c = 0; c < channels; c++) {
|
||||||
|
std::cout << "Channel #" << c << " size: " << widths[c] << " x "
|
||||||
|
<< heights[c] << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (subsampling) {
|
||||||
|
case NVJPEG_CSS_444:
|
||||||
|
std::cout << "YUV 4:4:4 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_440:
|
||||||
|
std::cout << "YUV 4:4:0 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_422:
|
||||||
|
std::cout << "YUV 4:2:2 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_420:
|
||||||
|
std::cout << "YUV 4:2:0 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_411:
|
||||||
|
std::cout << "YUV 4:1:1 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_410:
|
||||||
|
std::cout << "YUV 4:1:0 chroma subsampling" << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_GRAY:
|
||||||
|
std::cout << "Grayscale JPEG " << std::endl;
|
||||||
|
break;
|
||||||
|
case NVJPEG_CSS_UNKNOWN:
|
||||||
|
std::cout << "Unknown chroma subsampling" << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
|
||||||
|
int mul = 1;
|
||||||
|
// in the case of interleaved RGB output, write only to single channel, but
|
||||||
|
// 3 samples at once
|
||||||
|
if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) {
|
||||||
|
channels = 1;
|
||||||
|
mul = 3;
|
||||||
|
}
|
||||||
|
// in the case of rgb create 3 buffers with sizes of original image
|
||||||
|
else if (params.fmt == NVJPEG_OUTPUT_RGB ||
|
||||||
|
params.fmt == NVJPEG_OUTPUT_BGR) {
|
||||||
|
channels = 3;
|
||||||
|
widths[1] = widths[2] = widths[0];
|
||||||
|
heights[1] = heights[2] = heights[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// realloc output buffer if required
|
||||||
|
for (int c = 0; c < channels; c++) {
|
||||||
|
int aw = mul * widths[c];
|
||||||
|
int ah = heights[c];
|
||||||
|
int sz = aw * ah;
|
||||||
|
ibuf[i].pitch[c] = aw;
|
||||||
|
if (sz > isz[i].pitch[c]) {
|
||||||
|
if (ibuf[i].channel[c]) {
|
||||||
|
checkCudaErrors(cudaFree(ibuf[i].channel[c]));
|
||||||
|
}
|
||||||
|
checkCudaErrors(cudaMalloc(&ibuf[i].channel[c], sz));
|
||||||
|
isz[i].pitch[c] = sz;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
void release_buffers(std::vector<nvjpegImage_t> &ibuf) {
|
||||||
|
for (int i = 0; i < ibuf.size(); i++) {
|
||||||
|
for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++)
|
||||||
|
if (ibuf[i].channel[c]) checkCudaErrors(cudaFree(ibuf[i].channel[c]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
|
||||||
|
std::vector<nvjpegImage_t> &out, decode_params_t ¶ms,
|
||||||
|
double &time) {
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
nvjpegStatus_t err;
|
||||||
|
StopWatchInterface *timer = NULL;
|
||||||
|
sdkCreateTimer(&timer);
|
||||||
|
|
||||||
|
if (!params.batched) {
|
||||||
|
if (!params.pipelined) // decode one image at a time
|
||||||
|
{
|
||||||
|
int thread_idx = 0;
|
||||||
|
sdkStartTimer(&timer);
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, params.nvjpeg_state,
|
||||||
|
(const unsigned char *)img_data[i].data(),
|
||||||
|
img_len[i], params.fmt, &out[i],
|
||||||
|
params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int thread_idx = 0;
|
||||||
|
sdkStartTimer(&timer);
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
checkCudaErrors(
|
||||||
|
nvjpegDecodePhaseOne(params.nvjpeg_handle, params.nvjpeg_state,
|
||||||
|
(const unsigned char *)img_data[i].data(),
|
||||||
|
img_len[i], params.fmt, params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
checkCudaErrors(nvjpegDecodePhaseTwo(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, params.stream));
|
||||||
|
checkCudaErrors(nvjpegDecodePhaseThree(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, &out[i], params.stream));
|
||||||
|
}
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
std::vector<const unsigned char *> raw_inputs;
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
raw_inputs.push_back((const unsigned char *)img_data[i].data());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!params.pipelined) // decode multiple images in a single batch
|
||||||
|
{
|
||||||
|
sdkStartTimer(&timer);
|
||||||
|
checkCudaErrors(nvjpegDecodeBatched(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(),
|
||||||
|
img_len.data(), out.data(), params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
} else {
|
||||||
|
int thread_idx = 0;
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
checkCudaErrors(nvjpegDecodeBatchedPhaseOne(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, raw_inputs[i],
|
||||||
|
img_len[i], i, thread_idx, params.stream));
|
||||||
|
}
|
||||||
|
checkCudaErrors(nvjpegDecodeBatchedPhaseTwo(
|
||||||
|
params.nvjpeg_handle, params.nvjpeg_state, params.stream));
|
||||||
|
checkCudaErrors(nvjpegDecodeBatchedPhaseThree(params.nvjpeg_handle,
|
||||||
|
params.nvjpeg_state,
|
||||||
|
out.data(), params.stream));
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(params.stream));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sdkStopTimer(&timer);
|
||||||
|
time = sdkGetAverageTimerValue(&timer)/1000.0f;
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
int write_images(std::vector<nvjpegImage_t> &iout, std::vector<int> &widths,
|
||||||
|
std::vector<int> &heights, decode_params_t ¶ms,
|
||||||
|
FileNames &filenames) {
|
||||||
|
for (int i = 0; i < params.batch_size; i++) {
|
||||||
|
// Get the file name, without extension.
|
||||||
|
// This will be used to rename the output file.
|
||||||
|
size_t position = filenames[i].rfind("/");
|
||||||
|
std::string sFileName =
|
||||||
|
(std::string::npos == position)
|
||||||
|
? filenames[i]
|
||||||
|
: filenames[i].substr(position + 1, filenames[i].size());
|
||||||
|
position = sFileName.rfind(".");
|
||||||
|
sFileName = (std::string::npos == position) ? sFileName
|
||||||
|
: sFileName.substr(0, position);
|
||||||
|
std::string fname(params.output_dir + "/" + sFileName + ".bmp");
|
||||||
|
|
||||||
|
int err;
|
||||||
|
if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) {
|
||||||
|
err = writeBMP(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
|
||||||
|
iout[i].channel[1], iout[i].pitch[1], iout[i].channel[2],
|
||||||
|
iout[i].pitch[2], widths[i], heights[i]);
|
||||||
|
} else if (params.fmt == NVJPEG_OUTPUT_RGBI ||
|
||||||
|
params.fmt == NVJPEG_OUTPUT_BGRI) {
|
||||||
|
// Write BMP from interleaved data
|
||||||
|
err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
|
||||||
|
widths[i], heights[i]);
|
||||||
|
}
|
||||||
|
if (err) {
|
||||||
|
std::cout << "Cannot write output file: " << fname << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
std::cout << "Done writing decoded image to file: " << fname << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decodes params.total_images images in batches of params.batch_size,
// accumulating the decode time of all measured (non-warmup) batches.
//
// image_names - input JPEG file paths; iteration wraps over the list when
//               fewer files than total_images are available
// params      - decoder configuration and nvJPEG handles; params.stream is
//               created here and destroyed before returning
// total       - out: summed decode time of measured batches
//
// Returns EXIT_SUCCESS / EXIT_FAILURE.
// NOTE(review): return type is `double` but only int exit codes are
// returned; callers treat the result as a boolean - confirm before changing
// the signature.
double process_images(FileNames &image_names, decode_params_t &params,
                      double &total) {
  // vector for storing raw files and file lengths
  FileData file_data(params.batch_size);
  std::vector<size_t> file_len(params.batch_size);
  FileNames current_names(params.batch_size);
  std::vector<int> widths(params.batch_size);
  std::vector<int> heights(params.batch_size);
  // we wrap over image files to process total_images of files
  FileNames::iterator file_iter = image_names.begin();

  // stream for decoding
  checkCudaErrors(
      cudaStreamCreateWithFlags(&params.stream, cudaStreamNonBlocking));

  int total_processed = 0;

  // output buffers
  std::vector<nvjpegImage_t> iout(params.batch_size);
  // output buffer sizes, for convenience
  std::vector<nvjpegImage_t> isz(params.batch_size);

  // Zero out channel pointers and pitches - presumably so prepare_buffers can
  // tell which slots still need allocation (TODO confirm against
  // prepare_buffers).
  for (int i = 0; i < iout.size(); i++) {
    for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) {
      iout[i].channel[c] = NULL;
      iout[i].pitch[c] = 0;
      isz[i].pitch[c] = 0;
    }
  }

  double test_time = 0;
  int warmup = 0;
  while (total_processed < params.total_images) {
    // Read the next batch of files from disk; any failure aborts the run.
    if (read_next_batch(image_names, params.batch_size, file_iter, file_data,
                        file_len, current_names))
      return EXIT_FAILURE;

    // (Re)allocate output buffers as needed for this batch's dimensions.
    if (prepare_buffers(file_data, file_len, widths, heights, iout, isz,
                        current_names, params))
      return EXIT_FAILURE;

    double time;
    if (decode_images(file_data, file_len, iout, params, time))
      return EXIT_FAILURE;
    // The first params.warmup batches are excluded from the timing totals
    // and from the processed-image count.
    if (warmup < params.warmup) {
      warmup++;
    } else {
      total_processed += params.batch_size;
      test_time += time;
    }

    // Optionally dump the decoded batch as BMP files (-o flag).
    if (params.write_decoded)
      write_images(iout, widths, heights, params, current_names);
  }
  total = test_time;

  release_buffers(iout);

  checkCudaErrors(cudaStreamDestroy(params.stream));

  return EXIT_SUCCESS;
}
|
||||||
|
|
||||||
|
// parse parameters
|
||||||
|
// parse parameters
// Scans argv for an exact occurrence of `parm` (compared over at most 100
// characters) and returns its index.
//
// Returns the index of the single occurrence, -1 if the parameter is absent,
// and -1 (after printing an error) if it appears more than once.
int findParamIndex(const char **argv, int argc, const char *parm) {
  int count = 0;
  int index = -1;

  for (int i = 0; i < argc; i++) {
    if (strncmp(argv[i], parm, 100) == 0) {
      index = i;
      count++;
    }
  }

  // Fix: the original had an unreachable trailing `return -1;` after this
  // if/else; both branches already return.
  if (count == 0 || count == 1) {
    return index;
  }

  std::cout << "Error, parameter " << parm
            << " has been specified more than once, exiting\n"
            << std::endl;
  return -1;
}
|
||||||
|
|
||||||
|
int main(int argc, const char *argv[]) {
|
||||||
|
int pidx;
|
||||||
|
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-h")) != -1 ||
|
||||||
|
(pidx = findParamIndex(argv, argc, "--help")) != -1) {
|
||||||
|
std::cout << "Usage: " << argv[0]
|
||||||
|
<< " -i images_dir [-b batch_size] [-t total_images] [-device= "
|
||||||
|
"device_id] [-w warmup_iterations] [-o output_dir] "
|
||||||
|
"[-pipelined] [-batched] [-fmt output_format]\n";
|
||||||
|
std::cout << "Parameters: " << std::endl;
|
||||||
|
std::cout << "\timages_dir\t:\tPath to single image or directory of images"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\tbatch_size\t:\tDecode images from input by batches of "
|
||||||
|
"specified size"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\ttotal_images\t:\tDecode this much images, if there are "
|
||||||
|
"less images \n"
|
||||||
|
<< "\t\t\t\t\tin the input than total images, decoder will loop "
|
||||||
|
"over the input"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\tdevice_id\t:\tWhich device to use for decoding"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\twarmup_iterations\t:\tRun this amount of batches first "
|
||||||
|
"without measuring performance"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout
|
||||||
|
<< "\toutput_dir\t:\tWrite decoded images as BMPs to this directory"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl;
|
||||||
|
std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl;
|
||||||
|
std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. One "
|
||||||
|
"of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]"
|
||||||
|
<< std::endl;
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
decode_params_t params;
|
||||||
|
|
||||||
|
params.input_dir = "./";
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-i")) != -1) {
|
||||||
|
params.input_dir = argv[pidx + 1];
|
||||||
|
} else {
|
||||||
|
std::cerr << "Please specify input directory with encoded images"
|
||||||
|
<< std::endl;
|
||||||
|
return EXIT_WAIVED;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.batch_size = 1;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-b")) != -1) {
|
||||||
|
params.batch_size = std::atoi(argv[pidx + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
params.total_images = -1;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-t")) != -1) {
|
||||||
|
params.total_images = std::atoi(argv[pidx + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
params.dev = 0;
|
||||||
|
params.dev = findCudaDevice(argc, argv);
|
||||||
|
|
||||||
|
params.warmup = 0;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-w")) != -1) {
|
||||||
|
params.warmup = std::atoi(argv[pidx + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
params.batched = false;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-batched")) != -1) {
|
||||||
|
params.batched = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.pipelined = false;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-pipelined")) != -1) {
|
||||||
|
params.pipelined = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.fmt = NVJPEG_OUTPUT_RGB;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) {
|
||||||
|
std::string sfmt = argv[pidx + 1];
|
||||||
|
if (sfmt == "rgb")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_RGB;
|
||||||
|
else if (sfmt == "bgr")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_BGR;
|
||||||
|
else if (sfmt == "rgbi")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_RGBI;
|
||||||
|
else if (sfmt == "bgri")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_BGRI;
|
||||||
|
else if (sfmt == "yuv")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_YUV;
|
||||||
|
else if (sfmt == "y")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_Y;
|
||||||
|
else if (sfmt == "unchanged")
|
||||||
|
params.fmt = NVJPEG_OUTPUT_UNCHANGED;
|
||||||
|
else {
|
||||||
|
std::cout << "Unknown format: " << sfmt << std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
params.write_decoded = false;
|
||||||
|
if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
|
||||||
|
params.output_dir = argv[pidx + 1];
|
||||||
|
if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR &&
|
||||||
|
params.fmt != NVJPEG_OUTPUT_RGBI && params.fmt != NVJPEG_OUTPUT_BGRI) {
|
||||||
|
std::cout << "We can write ony BMPs, which require output format be "
|
||||||
|
"either RGB/BGR or RGBi/BGRi"
|
||||||
|
<< std::endl;
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
params.write_decoded = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaDeviceProp props;
|
||||||
|
checkCudaErrors(cudaGetDeviceProperties(&props, params.dev));
|
||||||
|
|
||||||
|
printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n",
|
||||||
|
params.dev, props.name, props.multiProcessorCount,
|
||||||
|
props.maxThreadsPerMultiProcessor, props.major, props.minor,
|
||||||
|
props.ECCEnabled ? "on" : "off");
|
||||||
|
|
||||||
|
nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free};
|
||||||
|
checkCudaErrors(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator,
|
||||||
|
¶ms.nvjpeg_handle));
|
||||||
|
checkCudaErrors(
|
||||||
|
nvjpegJpegStateCreate(params.nvjpeg_handle, ¶ms.nvjpeg_state));
|
||||||
|
checkCudaErrors(
|
||||||
|
nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
|
||||||
|
params.batch_size, 1, params.fmt));
|
||||||
|
|
||||||
|
// read source images
|
||||||
|
FileNames image_names;
|
||||||
|
readInput(params.input_dir, image_names);
|
||||||
|
|
||||||
|
if (params.total_images == -1) {
|
||||||
|
params.total_images = image_names.size();
|
||||||
|
} else if (params.total_images % params.batch_size) {
|
||||||
|
params.total_images =
|
||||||
|
((params.total_images) / params.batch_size) * params.batch_size;
|
||||||
|
std::cout << "Changing total_images number to " << params.total_images
|
||||||
|
<< " to be multiple of batch_size - " << params.batch_size
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Decoding images in directory: " << params.input_dir
|
||||||
|
<< ", total " << params.total_images << ", batchsize "
|
||||||
|
<< params.batch_size << std::endl;
|
||||||
|
|
||||||
|
double total;
|
||||||
|
if (process_images(image_names, params, total)) return EXIT_FAILURE;
|
||||||
|
std::cout << "Total decoding time: " << total << std::endl;
|
||||||
|
std::cout << "Avg decoding time per image: " << total / params.total_images
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Avg images per sec: " << params.total_images / total
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Avg decoding time per batch: "
|
||||||
|
<< total / ((params.total_images + params.batch_size - 1) /
|
||||||
|
params.batch_size)
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state));
|
||||||
|
checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle));
|
||||||
|
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
338
Samples/nvJPEG/nvJPEG_helper.hxx
Normal file
|
@ -0,0 +1,338 @@
|
||||||
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived
|
||||||
|
* from this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This sample needs at least CUDA 10.0.
|
||||||
|
// It demonstrates usages of the nvJPEG library
|
||||||
|
|
||||||
|
#ifndef NV_JPEG_EXAMPLE
|
||||||
|
#define NV_JPEG_EXAMPLE
|
||||||
|
|
||||||
|
#include "cuda_runtime.h"
|
||||||
|
#include "nvjpeg.h"
|
||||||
|
#include "helper_cuda.h"
|
||||||
|
#include "helper_timer.h"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <string.h> // strcmpi
|
||||||
|
#include <sys/time.h> // timings
|
||||||
|
|
||||||
|
#include <dirent.h> // linux dir traverse
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
// write bmp, input - RGB, device
|
||||||
|
int writeBMP(const char *filename, const unsigned char *d_chanR, int pitchR,
|
||||||
|
const unsigned char *d_chanG, int pitchG,
|
||||||
|
const unsigned char *d_chanB, int pitchB, int width, int height) {
|
||||||
|
unsigned int headers[13];
|
||||||
|
FILE *outfile;
|
||||||
|
int extrabytes;
|
||||||
|
int paddedsize;
|
||||||
|
int x;
|
||||||
|
int y;
|
||||||
|
int n;
|
||||||
|
int red, green, blue;
|
||||||
|
|
||||||
|
std::vector<unsigned char> vchanR(height * width);
|
||||||
|
std::vector<unsigned char> vchanG(height * width);
|
||||||
|
std::vector<unsigned char> vchanB(height * width);
|
||||||
|
unsigned char *chanR = vchanR.data();
|
||||||
|
unsigned char *chanG = vchanG.data();
|
||||||
|
unsigned char *chanB = vchanB.data();
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanR, (size_t)width, d_chanR, (size_t)pitchR,
|
||||||
|
width, height, cudaMemcpyDeviceToHost));
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanG, (size_t)width, d_chanG, (size_t)pitchR,
|
||||||
|
width, height, cudaMemcpyDeviceToHost));
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanB, (size_t)width, d_chanB, (size_t)pitchR,
|
||||||
|
width, height, cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
|
extrabytes =
|
||||||
|
4 - ((width * 3) % 4); // How many bytes of padding to add to each
|
||||||
|
// horizontal line - the size of which must
|
||||||
|
// be a multiple of 4 bytes.
|
||||||
|
if (extrabytes == 4) extrabytes = 0;
|
||||||
|
|
||||||
|
paddedsize = ((width * 3) + extrabytes) * height;
|
||||||
|
|
||||||
|
// Headers...
|
||||||
|
// Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
|
||||||
|
// "headers".
|
||||||
|
|
||||||
|
headers[0] = paddedsize + 54; // bfSize (whole file size)
|
||||||
|
headers[1] = 0; // bfReserved (both)
|
||||||
|
headers[2] = 54; // bfOffbits
|
||||||
|
headers[3] = 40; // biSize
|
||||||
|
headers[4] = width; // biWidth
|
||||||
|
headers[5] = height; // biHeight
|
||||||
|
|
||||||
|
// Would have biPlanes and biBitCount in position 6, but they're shorts.
|
||||||
|
// It's easier to write them out separately (see below) than pretend
|
||||||
|
// they're a single int, especially with endian issues...
|
||||||
|
|
||||||
|
headers[7] = 0; // biCompression
|
||||||
|
headers[8] = paddedsize; // biSizeImage
|
||||||
|
headers[9] = 0; // biXPelsPerMeter
|
||||||
|
headers[10] = 0; // biYPelsPerMeter
|
||||||
|
headers[11] = 0; // biClrUsed
|
||||||
|
headers[12] = 0; // biClrImportant
|
||||||
|
|
||||||
|
if (!(outfile = fopen(filename, "wb"))) {
|
||||||
|
std::cerr << "Cannot open file: " << filename << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers begin...
|
||||||
|
// When printing ints and shorts, we write out 1 character at a time to avoid
|
||||||
|
// endian issues.
|
||||||
|
//
|
||||||
|
fprintf(outfile, "BM");
|
||||||
|
|
||||||
|
for (n = 0; n <= 5; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
// These next 4 characters are for the biPlanes and biBitCount fields.
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", 1);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
fprintf(outfile, "%c", 24);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
|
||||||
|
for (n = 7; n <= 12; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers done, now write the data...
|
||||||
|
//
|
||||||
|
|
||||||
|
for (y = height - 1; y >= 0;
|
||||||
|
y--) // BMP image format is written from bottom to top...
|
||||||
|
{
|
||||||
|
for (x = 0; x <= width - 1; x++) {
|
||||||
|
red = chanR[y * width + x];
|
||||||
|
green = chanG[y * width + x];
|
||||||
|
blue = chanB[y * width + x];
|
||||||
|
|
||||||
|
if (red > 255) red = 255;
|
||||||
|
if (red < 0) red = 0;
|
||||||
|
if (green > 255) green = 255;
|
||||||
|
if (green < 0) green = 0;
|
||||||
|
if (blue > 255) blue = 255;
|
||||||
|
if (blue < 0) blue = 0;
|
||||||
|
// Also, it's written in (b,g,r) format...
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", blue);
|
||||||
|
fprintf(outfile, "%c", green);
|
||||||
|
fprintf(outfile, "%c", red);
|
||||||
|
}
|
||||||
|
if (extrabytes) // See above - BMP lines must be of lengths divisible by 4.
|
||||||
|
{
|
||||||
|
for (n = 1; n <= extrabytes; n++) {
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// write bmp, input - RGB, device
|
||||||
|
int writeBMPi(const char *filename, const unsigned char *d_RGB, int pitch,
|
||||||
|
int width, int height) {
|
||||||
|
unsigned int headers[13];
|
||||||
|
FILE *outfile;
|
||||||
|
int extrabytes;
|
||||||
|
int paddedsize;
|
||||||
|
int x;
|
||||||
|
int y;
|
||||||
|
int n;
|
||||||
|
int red, green, blue;
|
||||||
|
|
||||||
|
std::vector<unsigned char> vchanRGB(height * width * 3);
|
||||||
|
unsigned char *chanRGB = vchanRGB.data();
|
||||||
|
checkCudaErrors(cudaMemcpy2D(chanRGB, (size_t)width * 3, d_RGB, (size_t)pitch,
|
||||||
|
width * 3, height, cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
|
extrabytes =
|
||||||
|
4 - ((width * 3) % 4); // How many bytes of padding to add to each
|
||||||
|
// horizontal line - the size of which must
|
||||||
|
// be a multiple of 4 bytes.
|
||||||
|
if (extrabytes == 4) extrabytes = 0;
|
||||||
|
|
||||||
|
paddedsize = ((width * 3) + extrabytes) * height;
|
||||||
|
|
||||||
|
// Headers...
|
||||||
|
// Note that the "BM" identifier in bytes 0 and 1 is NOT included in these
|
||||||
|
// "headers".
|
||||||
|
headers[0] = paddedsize + 54; // bfSize (whole file size)
|
||||||
|
headers[1] = 0; // bfReserved (both)
|
||||||
|
headers[2] = 54; // bfOffbits
|
||||||
|
headers[3] = 40; // biSize
|
||||||
|
headers[4] = width; // biWidth
|
||||||
|
headers[5] = height; // biHeight
|
||||||
|
|
||||||
|
// Would have biPlanes and biBitCount in position 6, but they're shorts.
|
||||||
|
// It's easier to write them out separately (see below) than pretend
|
||||||
|
// they're a single int, especially with endian issues...
|
||||||
|
|
||||||
|
headers[7] = 0; // biCompression
|
||||||
|
headers[8] = paddedsize; // biSizeImage
|
||||||
|
headers[9] = 0; // biXPelsPerMeter
|
||||||
|
headers[10] = 0; // biYPelsPerMeter
|
||||||
|
headers[11] = 0; // biClrUsed
|
||||||
|
headers[12] = 0; // biClrImportant
|
||||||
|
|
||||||
|
if (!(outfile = fopen(filename, "wb"))) {
|
||||||
|
std::cerr << "Cannot open file: " << filename << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers begin...
|
||||||
|
// When printing ints and shorts, we write out 1 character at a time to avoid
|
||||||
|
// endian issues.
|
||||||
|
//
|
||||||
|
|
||||||
|
fprintf(outfile, "BM");
|
||||||
|
|
||||||
|
for (n = 0; n <= 5; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
// These next 4 characters are for the biPlanes and biBitCount fields.
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", 1);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
fprintf(outfile, "%c", 24);
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
|
||||||
|
for (n = 7; n <= 12; n++) {
|
||||||
|
fprintf(outfile, "%c", headers[n] & 0x000000FF);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16);
|
||||||
|
fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Headers done, now write the data...
|
||||||
|
//
|
||||||
|
for (y = height - 1; y >= 0;
|
||||||
|
y--) // BMP image format is written from bottom to top...
|
||||||
|
{
|
||||||
|
for (x = 0; x <= width - 1; x++) {
|
||||||
|
red = chanRGB[(y * width + x) * 3];
|
||||||
|
green = chanRGB[(y * width + x) * 3 + 1];
|
||||||
|
blue = chanRGB[(y * width + x) * 3 + 2];
|
||||||
|
|
||||||
|
if (red > 255) red = 255;
|
||||||
|
if (red < 0) red = 0;
|
||||||
|
if (green > 255) green = 255;
|
||||||
|
if (green < 0) green = 0;
|
||||||
|
if (blue > 255) blue = 255;
|
||||||
|
if (blue < 0) blue = 0;
|
||||||
|
// Also, it's written in (b,g,r) format...
|
||||||
|
|
||||||
|
fprintf(outfile, "%c", blue);
|
||||||
|
fprintf(outfile, "%c", green);
|
||||||
|
fprintf(outfile, "%c", red);
|
||||||
|
}
|
||||||
|
if (extrabytes) // See above - BMP lines must be of lengths divisible by 4.
|
||||||
|
{
|
||||||
|
for (n = 1; n <= extrabytes; n++) {
|
||||||
|
fprintf(outfile, "%c", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collects input file paths: if sInputPath is a regular file it is appended
// to `filelist`; if it is a directory, all regular files in it (recursing
// into subdirectories) are appended.
//
// Returns 0 on success, 1 when the path cannot be stat'ed or opened.
int readInput(const std::string &sInputPath,
              std::vector<std::string> &filelist) {
  int error_code = 1;
  struct stat s;

  if (stat(sInputPath.c_str(), &s) == 0) {
    if (s.st_mode & S_IFREG) {
      filelist.push_back(sInputPath);
    } else if (s.st_mode & S_IFDIR) {
      // Fix: ensure the directory path ends with a separator so that
      // concatenated entry names form valid paths even when the caller
      // omitted the trailing '/' (the original produced e.g. "dirfile.jpg").
      std::string sDirPath = sInputPath;
      if (sDirPath.empty() || sDirPath[sDirPath.size() - 1] != '/') {
        sDirPath += "/";
      }
      // processing each file in directory
      DIR *dir_handle;
      struct dirent *dir;
      dir_handle = opendir(sInputPath.c_str());
      if (dir_handle) {
        error_code = 0;
        while ((dir = readdir(dir_handle)) != NULL) {
          if (dir->d_type == DT_REG) {
            std::string sFileName = sDirPath + dir->d_name;
            filelist.push_back(sFileName);
          } else if (dir->d_type == DT_DIR) {
            std::string sname = dir->d_name;
            // Skip the self and parent entries to avoid infinite recursion.
            if (sname != "." && sname != "..") {
              readInput(sDirPath + sname + "/", filelist);
            }
          }
        }
        closedir(dir_handle);
      } else {
        std::cout << "Cannot open input directory: " << sInputPath << std::endl;
        return error_code;
      }
    } else {
      std::cout << "Cannot open input: " << sInputPath << std::endl;
      return error_code;
    }
  } else {
    std::cout << "Cannot find input path " << sInputPath << std::endl;
    return error_code;
  }

  return 0;
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -246,7 +246,11 @@ LIBRARIES :=
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# Gencode arguments
|
# Gencode arguments
|
||||||
|
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
|
||||||
|
SMS ?= 30 35 37 50 52 60 61 70 72 75
|
||||||
|
else
|
||||||
SMS ?= 30 35 37 50 52 60 61 70 75
|
SMS ?= 30 35 37 50 52 60 61 70 75
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(SMS),)
|
ifeq ($(SMS),)
|
||||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||||
|
|
|
@ -48,6 +48,7 @@
|
||||||
<sm-arch>sm60</sm-arch>
|
<sm-arch>sm60</sm-arch>
|
||||||
<sm-arch>sm61</sm-arch>
|
<sm-arch>sm61</sm-arch>
|
||||||
<sm-arch>sm70</sm-arch>
|
<sm-arch>sm70</sm-arch>
|
||||||
|
<sm-arch>sm72</sm-arch>
|
||||||
<sm-arch>sm75</sm-arch>
|
<sm-arch>sm75</sm-arch>
|
||||||
<supported_envs>
|
<supported_envs>
|
||||||
<env>
|
<env>
|
||||||
|
|
|
@ -10,7 +10,7 @@ Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Spa
|
||||||
|
|
||||||
## Supported SM Architectures
|
## Supported SM Architectures
|
||||||
|
|
||||||
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
|
||||||
|
|
||||||
## Supported OSes
|
## Supported OSes
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||||
|
|
||||||
## Build and Run
|
## Build and Run
|
||||||
|
|
||||||
|
|