Add and update samples with CUDA 10.1 Update 1 support

This commit is contained in:
Mahesh Doijade 2019-04-10 20:12:09 +05:30
parent 1abc294982
commit 337815dbee
210 changed files with 46770 additions and 1057 deletions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

166
Common/helper_cusolver.h Normal file
View File

@ -0,0 +1,166 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef HELPER_CUSOLVER
#define HELPER_CUSOLVER
#include <ctype.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cusparse.h"
#define SWITCH_CHAR '-'
// Command-line options shared by the cuSOLVER samples; each field is
// populated from the switch named in its trailing comment.
struct testOpts {
char *sparse_mat_filename; // by switch -F<filename>
const char *testFunc; // by switch -R<name>
const char *reorder; // by switch -P<name>
int lda; // by switch -lda<int>
};
/*
 * Returns the infinity norm of the n-element vector x, i.e. the largest
 * absolute value among its entries. Returns 0 when n <= 0.
 */
double vec_norminf(int n, const double *x) {
  double result = 0;
  for (int idx = 0; idx < n; idx++) {
    const double magnitude = fabs(x[idx]);
    if (magnitude > result) {
      result = magnitude;
    }
  }
  return result;
}
/*
* |A| = max { |A|*ones(m,1) }
*/
/*
 * Returns the matrix infinity norm of the m-by-n column-major matrix A
 * with leading dimension lda: the maximum over all rows of the sum of
 * absolute values in that row (|A| = max { |A|*ones(n,1) }).
 * Returns 0 when m <= 0.
 */
double mat_norminf(int m, int n, const double *A, int lda) {
  double result = 0;
  for (int row = 0; row < m; row++) {
    double rowSum = 0.0;
    for (int col = 0; col < n; col++) {
      rowSum += fabs(A[row + col * lda]);  // column-major addressing
    }
    if (rowSum > result) {
      result = rowSum;
    }
  }
  return result;
}
/*
* |A| = max { |A|*ones(m,1) }
*/
/*
 * Returns the infinity norm of an m-by-n sparse matrix stored in CSR form
 * (csrValA / csrRowPtrA / csrColIndA): the maximum over all rows of the sum
 * of absolute values of that row's stored entries. The row-pointer offsets
 * honor the index base (0 or 1) recorded in descrA. n and nnzA are unused
 * but kept for a uniform CSR argument list.
 */
double csr_mat_norminf(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
                       const double *csrValA, const int *csrRowPtrA,
                       const int *csrColIndA) {
  const int base =
      (cusparseGetMatIndexBase(descrA) == CUSPARSE_INDEX_BASE_ONE) ? 1 : 0;
  double result = 0;
  for (int row = 0; row < m; row++) {
    double rowSum = 0.0;
    for (int k = csrRowPtrA[row] - base; k < csrRowPtrA[row + 1] - base; k++) {
      rowSum += fabs(csrValA[k]);
    }
    if (rowSum > result) {
      result = rowSum;
    }
  }
  return result;
}
/*
 * Prints a size summary followed by every stored entry of the m-by-n CSR
 * matrix, one "A(row, col) = value" line per nonzero, using MATLAB-style
 * base-1 indices regardless of the index base recorded in descrA.
 */
void display_matrix(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
                    const double *csrValA, const int *csrRowPtrA,
                    const int *csrColIndA) {
  const int base =
      (cusparseGetMatIndexBase(descrA) == CUSPARSE_INDEX_BASE_ONE) ? 1 : 0;
  printf("m = %d, n = %d, nnz = %d, matlab base-1\n", m, n, nnzA);
  for (int row = 0; row < m; row++) {
    for (int k = csrRowPtrA[row] - base; k < csrRowPtrA[row + 1] - base; k++) {
      const int col = csrColIndA[k] - base;
      printf("A(%d, %d) = %20.16E\n", row + 1, col + 1, csrValA[k]);
    }
  }
}
// Portable wall-clock timer: second() returns elapsed time in seconds as a
// double. One implementation is selected per platform below.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
// Windows: use QueryPerformanceCounter when a high-resolution timer exists,
// otherwise fall back to millisecond-granularity GetTickCount.
// NOTE(review): the lazy one-time init of the static flags below is not
// thread-safe — confirm callers only time from a single thread.
double second(void) {
LARGE_INTEGER t;
static double oofreq;
static int checkedForHighResTimer;
static BOOL hasHighResTimer;
if (!checkedForHighResTimer) {
// Cache 1/frequency once so each later call is a single multiply.
hasHighResTimer = QueryPerformanceFrequency(&t);
oofreq = 1.0 / (double)t.QuadPart;
checkedForHighResTimer = 1;
}
if (hasHighResTimer) {
QueryPerformanceCounter(&t);
return (double)t.QuadPart * oofreq;
} else {
return (double)GetTickCount() / 1000.0;
}
}
#elif defined(__linux__) || defined(__QNX__)
#include <stddef.h>
#include <sys/resource.h>
#include <sys/time.h>
// Linux/QNX: microsecond-resolution wall clock via gettimeofday.
double second(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
#elif defined(__APPLE__)
#include <stddef.h>
#include <sys/resource.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/types.h>
// macOS: same gettimeofday-based implementation as Linux/QNX.
double second(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
#else
#error unsupported platform
#endif
#endif

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions modification, are permitted provided that the following conditions

View File

@ -1,6 +1,6 @@
############################################################################### ###############################################################################
# #
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions

View File

@ -6,6 +6,12 @@ Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This ve
This section describes the release notes for the CUDA Samples on GitHub only. This section describes the release notes for the CUDA Samples on GitHub only.
### CUDA 10.1 Update 1
* Added `NV12toBGRandResize`. Demonstrates how to convert and resize NV12 frames to BGR planar frames using CUDA in batch.
* Added `EGLStream_CUDA_Interop`. Demonstrates data exchange between CUDA and EGL Streams.
* Added `cuSolverDn_LinearSolver`. Demonstrates cuSolverDN's LU, QR and Cholesky factorization.
* Added support of Visual Studio 2019 to all samples supported on [Windows](#windows-1).
### CUDA 10.1 ### CUDA 10.1
* Added `immaTensorCoreGemm`. Demonstrates integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integers employing the Tensor Cores. * Added `immaTensorCoreGemm`. Demonstrates integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integers employing the Tensor Cores.
* Added `simpleIPC`. Demonstrates Inter Process Communication with one process per GPU for computation. * Added `simpleIPC`. Demonstrates Inter Process Communication with one process per GPU for computation.
@ -128,29 +134,32 @@ The samples makefiles can take advantage of certain options:
### Samples by OS ### Samples by OS
#### Linux #### Linux
**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[shfl_scan](./Samples/shfl_scan)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** |
---|---|---|---| ---|---|---|---|
**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** |
**[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
**[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[reduction](./Samples/reduction)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** |
**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** |
#### Windows #### Windows
**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[shfl_scan](./Samples/shfl_scan)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[shfl_scan](./Samples/shfl_scan)** |
---|---|---|---| ---|---|---|---|
**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** |
**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** |
**[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** |
**[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** |
**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** |
**[matrixMul](./Samples/matrixMul)** |
#### Mac OSX #### Mac OSX
**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
---|---|---|---| ---|---|---|---|
**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** |
**[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** |
**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** |
## Dependencies ## Dependencies

View File

@ -0,0 +1,364 @@
################################################################################
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
endif
ifeq ($(TARGET_OS),qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM
LDFLAGS += -lsocket
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
# This sample is not supported on Mac OSX
ifeq ($(TARGET_OS),darwin)
$(info >>> WARNING - EGLStream_CUDA_Interop is not supported on Mac OSX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - EGLStream_CUDA_Interop is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on android
ifeq ($(TARGET_OS),android)
$(info >>> WARNING - EGLStream_CUDA_Interop is not supported on android - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../Common
LIBRARIES :=
################################################################################
# Makefile include to help find EGL Libraries
include ./findegl.mk
# EGL specific libraries
ifneq ($(TARGET_OS),darwin)
LIBRARIES += -lEGL
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: EGLStream_CUDA_Interop
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
cuda_consumer.o:cuda_consumer.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cuda_producer.o:cuda_producer.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
eglstrm_common.o:eglstrm_common.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
main.o:main.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
EGLStream_CUDA_Interop: cuda_consumer.o cuda_producer.o eglstrm_common.o main.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./EGLStream_CUDA_Interop
clean:
rm -f EGLStream_CUDA_Interop cuda_consumer.o cuda_producer.o eglstrm_common.o main.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/EGLStream_CUDA_Interop
clobber: clean

View File

@ -0,0 +1,76 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>EGLStream_CUDA_Interop</name>
<cuda_api_list>
<driver>cuDeviceGet</driver>
<driver>cuDeviceGetAttribute</driver>
<driver>cuDeviceComputeCapability</driver>
<driver>cuDeviceGetCount</driver>
<driver>cuDeviceGetName</driver>
<driver>cuGraphicsResourceGetMappedEglFrame</driver>
<driver>cuEGLStreamConsumerAcquireFrame</driver>
<driver>cuEGLStreamConsumerReleaseFrame</driver>
<driver>cuEGLStreamProducerPresentFrame</driver>
<driver>cuCtxCreate</driver>
<driver>cuMemAlloc</driver>
<driver>cuMemFree</driver>
<driver>cuMemcpy3D</driver>
<driver>cuStreamCreate</driver>
<driver>cuCtxPushCurrent</driver>
<driver>cuCtxPopCurrent</driver>
<driver>cuCtxDestroy</driver>
</cuda_api_list>
<description><![CDATA[Demonstrates data exchange between CUDA and EGL Streams.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../common/inc</path>
</includepaths>
<keyconcepts>
<concept level="basic">EGLStreams Interop</concept>
</keyconcepts>
<keywords>
<keyword>EGL Streams</keyword>
</keywords>
<libraries>
<library os="linux">cuda</library>
<library framework="true" os="macosx">CUDA</library>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>main.cpp</primary_file>
<required_dependencies>
<dependency>EGL</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>2:Graphics Interop</scope>
</scopes>
<sm-arch>sm30</sm-arch>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>aarch64</arch>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>EGLStream CUDA Interop</title>
<type>exe</type>
</entry>

View File

@ -0,0 +1,64 @@
# EGLStream_CUDA_Interop - EGLStream CUDA Interop
## Description
Demonstrates data exchange between CUDA and EGL Streams.
## Key Concepts
EGLStreams Interop
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux
## Supported CPU Architecture
x86_64, aarch64
## CUDA APIs involved
### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, cuDeviceGetName, cuGraphicsResourceGetMappedEglFrame, cuEGLStreamConsumerAcquireFrame, cuEGLStreamConsumerReleaseFrame, cuEGLStreamProducerPresentFrame, cuCtxCreate, cuMemAlloc, cuMemFree, cuMemcpy3D, cuStreamCreate, cuCtxPushCurrent, cuCtxPopCurrent, cuCtxDestroy
## Dependencies needed to build/run
[EGL](../../README.md#egl)
## Prerequisites
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
## Build and Run
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, aarch64.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=aarch64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@ -0,0 +1,318 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: Simple CUDA consumer rendering sample app
//
#include "cuda_consumer.h"
#include <helper_cuda_drvapi.h>
#include "eglstrm_common.h"
#if defined(EXTENSION_LIST)
EXTENSION_LIST(EXTLST_EXTERN)
#endif
int checkbuf(FILE *fp1, FILE *fp2);
/* Consumer side of the EGLStream test.
 *
 * Waits for a new frame on the global EGLStream, acquires it through the
 * CUDA/EGL interop API, copies every plane back to host memory, writes the
 * raw bytes to |fileName|, and compares the capture against the two
 * reference input files recorded in |data|.  Handles both pitch-linear and
 * CUDA-array backed frames with 1..3 planes.
 *
 * Returns CUDA_SUCCESS when the frame was acquired, dumped, and released
 * cleanly; a CUDA error code otherwise.
 */
CUresult cudaConsumerTest(test_cuda_consumer_s *data, char *fileName) {
  CUresult cuStatus = CUDA_SUCCESS;
  CUarray cudaArr = NULL;
  CUeglFrame cudaEgl;
  CUgraphicsResource cudaResource;
  unsigned int i;
  int check_result;
  FILE *pInFile1 = NULL, *pInFile2 = NULL, *file_p = NULL;
  // Host staging buffer shared by all planes.  Allocated once for plane 0
  // (always the largest plane) and released at |done| so error paths no
  // longer leak it.
  unsigned char *pCudaCopyMem = NULL;
  EGLint streamState = 0;
  if (!data) {
    printf("%s: Bad parameter\n", __func__);
    // Previously fell through returning CUDA_SUCCESS on a bad argument.
    cuStatus = CUDA_ERROR_INVALID_VALUE;
    goto done;
  }
  if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
                         &streamState)) {
    printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
  }
  if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) {
    printf("CUDA Consumer: - EGL_STREAM_STATE_DISCONNECTED_KHR received\n");
  }
  if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) {
    // 16000 is the acquire timeout handed to the driver — confirm units
    // against the cuEGLStreamConsumerAcquireFrame documentation.
    cuStatus = cuEGLStreamConsumerAcquireFrame(&(data->cudaConn), &cudaResource,
                                               NULL, 16000);
    if (cuStatus == CUDA_SUCCESS) {
      CUdeviceptr pDevPtr = 0;
      int bufferSize;
      unsigned int copyWidthInBytes = 0, copyHeight = 0;
      file_p = fopen(fileName, "wb+");
      if (!file_p) {
        printf("WriteFrame: file open failed %s\n", fileName);
        cuStatus = CUDA_ERROR_UNKNOWN;
        goto done;
      }
      cuStatus =
          cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
      if (cuStatus != CUDA_SUCCESS) {
        printf("Cuda get resource failed with %d\n", cuStatus);
        goto done;
      }
      cuStatus = cuCtxSynchronize();
      if (cuStatus != CUDA_SUCCESS) {
        printf("cuCtxSynchronize failed \n");
        goto done;
      }
      if (!(cudaEgl.planeCount >= 1 && cudaEgl.planeCount <= 3)) {
        printf("Plane count is invalid\nExiting\n");
        // Was returning CUDA_SUCCESS on this failure path.
        cuStatus = CUDA_ERROR_INVALID_VALUE;
        goto done;
      }
      // Dump every plane of the acquired frame to |file_p|.
      for (i = 0; i < cudaEgl.planeCount; i++) {
        if (cudaEgl.frameType == CU_EGL_FRAME_TYPE_PITCH) {
          pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[i];
          if (cudaEgl.planeCount == 1) {
            // Single plane (e.g. RGBA): whole surface in one copy.
            bufferSize = cudaEgl.pitch * cudaEgl.height;
            copyWidthInBytes = cudaEgl.pitch;
            copyHeight = data->height;
          } else if (i == 1 && cudaEgl.planeCount == 2) {  // YUV 420
            // semi-planar: interleaved chroma plane is full width, half height
            bufferSize = cudaEgl.pitch * cudaEgl.height / 2;
            copyWidthInBytes = cudaEgl.pitch;
            copyHeight = data->height / 2;
          } else {
            // Planar YUV: chroma planes are quarter the size of luma.
            bufferSize = data->width * data->height;
            copyWidthInBytes = data->width;
            copyHeight = data->height;
            if (i > 0) {
              bufferSize >>= 2;
              copyWidthInBytes >>= 1;
              copyHeight >>= 1;
            }
          }
        } else {
          cudaArr = cudaEgl.frame.pArray[i];
          if (cudaEgl.planeCount == 1) {
            // Single plane, 4 bytes per pixel.
            bufferSize = data->width * data->height * 4;
            copyWidthInBytes = data->width * 4;
            copyHeight = data->height;
          } else if (i == 1 && cudaEgl.planeCount == 2) {  // YUV 420
            // semi-planar
            bufferSize = data->width * data->height / 2;
            copyWidthInBytes = data->width;
            copyHeight = data->height / 2;
          } else {
            bufferSize = data->width * data->height;
            copyWidthInBytes = data->width;
            copyHeight = data->height;
            if (i > 0) {
              bufferSize >>= 2;
              copyWidthInBytes >>= 1;
              copyHeight >>= 1;
            }
          }
        }
        if (i == 0) {
          // Plane 0 is the largest plane, so one allocation covers them all.
          pCudaCopyMem = (unsigned char *)malloc(bufferSize);
          if (pCudaCopyMem == NULL) {
            printf("pCudaCopyMem malloc failed\n");
            // Was returning CUDA_SUCCESS on allocation failure.
            cuStatus = CUDA_ERROR_OUT_OF_MEMORY;
            goto done;
          }
        }
        memset(pCudaCopyMem, 0, bufferSize);
        if (data->pitchLinearOutput) {
          cuStatus = cuMemcpyDtoH(pCudaCopyMem, pDevPtr, bufferSize);
          if (cuStatus != CUDA_SUCCESS) {
            printf(
                "cuda_consumer: pitch linear Memcpy failed, bufferSize =%d\n",
                bufferSize);
            goto done;
          }
          cuStatus = cuCtxSynchronize();
          if (cuStatus != CUDA_SUCCESS) {
            printf("cuda_consumer: cuCtxSynchronize failed after memcpy \n");
            goto done;
          }
        } else {
          // Array-backed plane: use a 3D copy descriptor with depth 1.
          CUDA_MEMCPY3D cpdesc;
          memset(&cpdesc, 0, sizeof(cpdesc));
          cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0;
          cpdesc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
          cpdesc.srcArray = cudaArr;
          cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0;
          cpdesc.dstMemoryType = CU_MEMORYTYPE_HOST;
          cpdesc.dstHost = (void *)pCudaCopyMem;
          cpdesc.WidthInBytes = copyWidthInBytes;
          cpdesc.Height = copyHeight;
          cpdesc.Depth = 1;
          cuStatus = cuMemcpy3D(&cpdesc);
          if (cuStatus != CUDA_SUCCESS) {
            printf(
                "Cuda consumer: cuMemCpy3D failed, copyWidthInBytes=%d, "
                "copyHight=%d\n",
                copyWidthInBytes, copyHeight);
          }
          cuStatus = cuCtxSynchronize();
          if (cuStatus != CUDA_SUCCESS) {
            printf("cuCtxSynchronize failed after memcpy \n");
          }
        }
        if (cuStatus == CUDA_SUCCESS) {
          if (fwrite(pCudaCopyMem, bufferSize, 1, file_p) != 1) {
            printf("Cuda consumer: output file write failed\n");
            cuStatus = CUDA_ERROR_UNKNOWN;
            goto done;
          }
        }
      }
      // Compare the dumped frame against both reference inputs; the
      // producer may have presented either one.
      pInFile1 = fopen(data->fileName1, "rb");
      if (!pInFile1) {
        printf("Failed to open file :%s\n", data->fileName1);
        // Was returning CUDA_SUCCESS on this failure path.
        cuStatus = CUDA_ERROR_UNKNOWN;
        goto done;
      }
      pInFile2 = fopen(data->fileName2, "rb");
      if (!pInFile2) {
        printf("Failed to open file :%s\n", data->fileName2);
        cuStatus = CUDA_ERROR_UNKNOWN;
        goto done;
      }
      rewind(file_p);
      check_result = checkbuf(file_p, pInFile1);
      if (check_result == -1) {
        rewind(file_p);
        check_result = checkbuf(file_p, pInFile2);
        if (check_result == -1) {
          printf("Frame received does not match any valid image: FAILED\n");
        } else {
          printf("Frame check Passed\n");
        }
      } else {
        printf("Frame check Passed\n");
      }
      cuStatus =
          cuEGLStreamConsumerReleaseFrame(&data->cudaConn, cudaResource, NULL);
      if (cuStatus != CUDA_SUCCESS) {
        printf("cuEGLStreamConsumerReleaseFrame failed with cuStatus = %d\n",
               cuStatus);
        goto done;
      }
    } else {
      printf("cuda AcquireFrame FAILED with cuStatus=%d\n", cuStatus);
      goto done;
    }
  }
done:
  // Single cleanup point for all exit paths.
  if (pCudaCopyMem) {
    free(pCudaCopyMem);
    pCudaCopyMem = NULL;
  }
  if (file_p) {
    fclose(file_p);
    file_p = NULL;
  }
  if (pInFile1) {
    fclose(pInFile1);
    pInFile1 = NULL;
  }
  if (pInFile2) {  // was testing pInFile1 again, leaking pInFile2
    fclose(pInFile2);
    pInFile2 = NULL;
  }
  return cuStatus;
}
// Compares two already-positioned streams byte by byte.
// Returns 1 when both streams contain identical bytes up to a common EOF,
// -1 when they differ (or when either pointer is NULL).  Callers are
// expected to rewind the streams beforehand.
int checkbuf(FILE *fp1, FILE *fp2) {
  if (fp1 == NULL) {
    printf("Invalid file pointer for first file\n");
    return -1;
  }
  if (fp2 == NULL) {
    printf("Invalid file pointer for second file\n");
    return -1;
  }
  int c1, c2;
  do {
    c1 = getc(fp1);
    c2 = getc(fp2);
  } while (c1 != EOF && c2 != EOF && c1 == c2);
  // Both streams hit EOF together => identical; anything else is a mismatch.
  return (c1 == c2) ? 1 : -1;
}
// Initializes the CUDA driver API, prints the compute capability of
// |device|, and creates the consumer's CUDA context on it.  The freshly
// created context is popped from the calling thread before returning; the
// handle remains in |cudaConsumer->context|.
CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer,
                                  CUdevice device) {
  CUresult status = cuInit(0);
  if (status != CUDA_SUCCESS) {
    printf("Failed to initialize CUDA\n");
    return status;
  }
  int ccMajor = 0;
  int ccMinor = 0;
  char deviceName[256];
  checkCudaErrors(cuDeviceGetAttribute(
      &ccMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
  checkCudaErrors(cuDeviceGetAttribute(
      &ccMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
  checkCudaErrors(cuDeviceGetName(deviceName, 256, device));
  printf(
      "CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
      "%d.%d\n\n",
      device, deviceName, ccMajor, ccMinor);
  status = cuCtxCreate(&cudaConsumer->context, 0, device);
  if (status != CUDA_SUCCESS) {
    printf("failed to create CUDA context\n");
    return status;
  }
  checkCudaErrors(cuCtxPopCurrent(&cudaConsumer->context));
  return status;
}
// Seeds the consumer state from the parsed command-line arguments.
void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args) {
  // Frame geometry and layout of the frames we expect to receive.
  cudaConsumer->width = args->inputWidth;
  cudaConsumer->height = args->inputHeight;
  cudaConsumer->pitchLinearOutput = args->pitchLinearOutput;
  // Reference inputs the received frames are checked against.
  cudaConsumer->fileName1 = args->infile1;
  cudaConsumer->fileName2 = args->infile2;
  // Fixed capture files for the two received frames.
  cudaConsumer->outFile1 = "cuda_out1.yuv";
  cudaConsumer->outFile2 = "cuda_out2.yuv";
}
// Tears down the consumer's connection to the EGLStream.
// Returns the status of cuEGLStreamConsumerDisconnect.
CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer) {
  return cuEGLStreamConsumerDisconnect(&cudaConsumer->cudaConn);
}

View File

@ -0,0 +1,62 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: CUDA consumer header file
//
#ifndef _CUDA_CONSUMER_H_
#define _CUDA_CONSUMER_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cudaEGL.h"
#include "eglstrm_common.h"
// Globals shared with the EGL helper code (defined in eglstrm_common.cpp).
extern EGLStreamKHR eglStream;
extern EGLDisplay g_display;
// State for one CUDA EGLStream consumer instance.
typedef struct _test_cuda_consumer_s {
CUcontext context;  // consumer-side CUDA context
CUeglStreamConnection cudaConn;  // consumer connection to the EGLStream
bool pitchLinearOutput;  // true: pitch-linear frames, false: CUDA arrays
unsigned int width;  // frame width in pixels
unsigned int height;  // frame height in pixels
char *fileName1;  // first reference input file (compared against captures)
char *fileName2;  // second reference input file
char *outFile1;  // capture file for the first received frame
char *outFile2;  // capture file for the second received frame
unsigned int frameCount;  // NOTE(review): not set by cuda_consumer_init — confirm users
} test_cuda_consumer_s;
// Fills |cudaConsumer| from the parsed command-line arguments.
void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args);
// Disconnects the consumer from the EGLStream.
CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer);
// Acquires one frame from the stream, dumps it to |outFile| and checks it
// against the reference inputs.
CUresult cudaConsumerTest(test_cuda_consumer_s *data, char *outFile);
// Creates the consumer's CUDA context on |device|.
CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer,
                                  CUdevice device);
#endif

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,381 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: Simple cuda EGL stream producer app
//
#include "cuda_producer.h"
#include <helper_cuda_drvapi.h>
#include "cudaEGL.h"
#include "eglstrm_common.h"
#if defined(EXTENSION_LIST)
EXTENSION_LIST(EXTLST_EXTERN)
#endif
// Reads frame |frameNum| of planar YUV420 data from |file| into |pBuff|.
// The buffer layout is Y then V then U (YVU order).  Returns CUDA_SUCCESS
// on success, CUDA_ERROR_FILE_NOT_FOUND on bad arguments, and
// CUDA_ERROR_NOT_PERMITTED on a seek or read failure.
static CUresult cudaProducerReadYUVFrame(FILE *file, unsigned int frameNum,
                                         unsigned int width,
                                         unsigned int height,
                                         unsigned char *pBuff) {
  const unsigned int frameSize = (width * height * 3) / 2;
  unsigned char *planes[3];
  unsigned int rows[3], rowBytes[3];
  unsigned int p, r;

  if (!pBuff || !file) return CUDA_ERROR_FILE_NOT_FOUND;

  // Plane order in the destination buffer: Y (full size), then V, then U
  // (each quarter size).
  planes[0] = pBuff;
  planes[1] = pBuff + width * height;
  planes[2] = planes[1] + width * height / 4;
  rows[0] = height;
  rowBytes[0] = width;
  rows[1] = height / 2;
  rowBytes[1] = width / 2;
  rows[2] = height / 2;
  rowBytes[2] = width / 2;

  if (fseek(file, frameNum * frameSize, SEEK_SET)) {
    printf("ReadYUVFrame: Error seeking file: %p\n", file);
    return CUDA_ERROR_NOT_PERMITTED;
  }
  // Read each plane row by row.
  for (p = 0; p < 3; p++) {
    for (r = 0; r < rows[p]; r++) {
      if (fread(planes[p], rowBytes[p], 1, file) != 1) {
        printf("ReadYUVFrame: Error reading file: %p\n", file);
        return CUDA_ERROR_NOT_PERMITTED;
      }
      planes[p] += rowBytes[p];
    }
  }
  return CUDA_SUCCESS;
}
// Reads frame |frameNum| of raw 32-bit ARGB data (width * height * 4 bytes)
// from |file| into |pBuff| with a single fread.  Returns CUDA_SUCCESS on
// success, CUDA_ERROR_FILE_NOT_FOUND on bad arguments, and
// CUDA_ERROR_NOT_PERMITTED on a seek or read failure.
static CUresult cudaProducerReadARGBFrame(FILE *file, unsigned int frameNum,
                                          unsigned int width,
                                          unsigned int height,
                                          unsigned char *pBuff) {
  unsigned int frameSize = width * height * 4;
  CUresult ret = CUDA_SUCCESS;
  if (!pBuff || !file) return CUDA_ERROR_FILE_NOT_FOUND;
  if (fseek(file, frameNum * frameSize, SEEK_SET)) {
    // Fixed log label: this is the ARGB reader (was "ReadYUVFrame").
    printf("ReadARGBFrame: Error seeking file: %p\n", file);
    ret = CUDA_ERROR_NOT_PERMITTED;
    goto done;
  }
  // read ARGB data
  if (fread(pBuff, frameSize, 1, file) != 1) {
    if (feof(file))
      printf("ReadARGBFrame: file read to the end\n");
    else
      printf("ReadARGBFrame: Error reading file: %p\n", file);
    ret = CUDA_ERROR_NOT_PERMITTED;
    goto done;
  }
done:
  return ret;
}
/* Producer side of the EGLStream test.
 *
 * Reads one frame (ARGB or planar YUV420, per the producer configuration)
 * from |file|, uploads it into the device buffers/arrays pre-allocated by
 * cudaDeviceCreateProducer, and presents it on the producer's EGLStream
 * connection.
 *
 * Returns CUDA_SUCCESS on success, otherwise a CUDA error code.
 */
CUresult cudaProducerTest(test_cuda_producer_s *cudaProducer, char *file) {
  int framenum = 0;  // only the first frame of each file is presented
  CUarray cudaArr[3] = {0};
  CUdeviceptr cudaPtr[3] = {0, 0, 0};
  CUresult cuStatus = CUDA_SUCCESS;
  unsigned int i, surfNum, uvOffset[3] = {0};
  unsigned int copyWidthInBytes[3] = {0, 0, 0}, copyHeight[3] = {0, 0, 0};
  CUeglColorFormat eglColorFormat;
  FILE *file_p = NULL;
  CUeglFrame cudaEgl;
  file_p = fopen(file, "rb");
  if (!file_p) {
    printf("CudaProducer: Error opening file: %s\n", file);
    // Was returning CUDA_SUCCESS despite the failure.
    cuStatus = CUDA_ERROR_FILE_NOT_FOUND;
    goto done;
  }
  // Select the pre-allocated destination buffers for the configured format.
  if (cudaProducer->pitchLinearOutput) {
    if (cudaProducer->isARGB) {
      cudaPtr[0] = cudaProducer->cudaPtrARGB[0];
    } else {  // YUV case
      for (i = 0; i < 3; i++) {
        cudaPtr[i] = cudaProducer->cudaPtrYUV[i];
      }
    }
  } else {
    if (cudaProducer->isARGB) {
      cudaArr[0] = cudaProducer->cudaArrARGB[0];
    } else {
      for (i = 0; i < 3; i++) {
        cudaArr[i] = cudaProducer->cudaArrYUV[i];
      }
    }
  }
  // Read the source frame into the host staging buffer and record the
  // per-plane copy geometry.
  uvOffset[0] = 0;
  if (cudaProducer->isARGB) {
    cuStatus =
        cudaProducerReadARGBFrame(file_p, framenum, cudaProducer->width,
                                  cudaProducer->height, cudaProducer->pBuff);
    if (cuStatus != CUDA_SUCCESS) {  // was discarding the error code
      printf("cuda producer, read ARGB frame failed\n");
      goto done;
    }
    copyWidthInBytes[0] = cudaProducer->width * 4;
    copyHeight[0] = cudaProducer->height;
    surfNum = 1;
    eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB;
  } else {
    cuStatus =
        cudaProducerReadYUVFrame(file_p, framenum, cudaProducer->width,
                                 cudaProducer->height, cudaProducer->pBuff);
    if (cuStatus != CUDA_SUCCESS) {  // was discarding the error code
      printf("cuda producer, reading YUV frame failed\n");
      goto done;
    }
    surfNum = 3;
    eglColorFormat = CU_EGL_COLOR_FORMAT_YUV420_PLANAR;
    copyWidthInBytes[0] = cudaProducer->width;
    copyHeight[0] = cudaProducer->height;
    copyWidthInBytes[1] = cudaProducer->width / 2;
    copyHeight[1] = cudaProducer->height / 2;
    copyWidthInBytes[2] = cudaProducer->width / 2;
    copyHeight[2] = cudaProducer->height / 2;
    uvOffset[1] = cudaProducer->width * cudaProducer->height;
    uvOffset[2] =
        uvOffset[1] + cudaProducer->width / 2 * cudaProducer->height / 2;
  }
  // Upload each plane from the host staging buffer to the device.
  if (cudaProducer->pitchLinearOutput) {
    for (i = 0; i < surfNum; i++) {
      cuStatus =
          cuMemcpy(cudaPtr[i], (CUdeviceptr)(cudaProducer->pBuff + uvOffset[i]),
                   copyWidthInBytes[i] * copyHeight[i]);
      if (cuStatus != CUDA_SUCCESS) {
        printf("Cuda producer: cuMemCpy pitchlinear failed, cuStatus =%d\n",
               cuStatus);
        goto done;
      }
    }
  } else {
    // Array-backed planes: 3D copy descriptor with depth 1 per plane.
    CUDA_MEMCPY3D cpdesc;
    for (i = 0; i < surfNum; i++) {
      memset(&cpdesc, 0, sizeof(cpdesc));
      cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0;
      cpdesc.srcMemoryType = CU_MEMORYTYPE_HOST;
      cpdesc.srcHost = (void *)(cudaProducer->pBuff + uvOffset[i]);
      cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0;
      cpdesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
      cpdesc.dstArray = cudaArr[i];
      cpdesc.WidthInBytes = copyWidthInBytes[i];
      cpdesc.Height = copyHeight[i];
      cpdesc.Depth = 1;
      cuStatus = cuMemcpy3D(&cpdesc);
      if (cuStatus != CUDA_SUCCESS) {
        printf("Cuda producer: cuMemCpy failed, cuStatus =%d\n", cuStatus);
        goto done;
      }
    }
  }
  // Describe the frame for the EGLStream and hand it to the consumer.
  for (i = 0; i < surfNum; i++) {
    if (cudaProducer->pitchLinearOutput)
      cudaEgl.frame.pPitch[i] = (void *)cudaPtr[i];
    else
      cudaEgl.frame.pArray[i] = cudaArr[i];
  }
  // NOTE(review): width/pitch carry the plane-0 byte width (width*4 for
  // ARGB) rather than the pixel width; the consumer mirrors this — confirm
  // against the CUeglFrame documentation.
  cudaEgl.width = copyWidthInBytes[0];
  cudaEgl.depth = 1;
  cudaEgl.height = copyHeight[0];
  cudaEgl.pitch = cudaProducer->pitchLinearOutput ? cudaEgl.width : 0;
  cudaEgl.frameType = cudaProducer->pitchLinearOutput ? CU_EGL_FRAME_TYPE_PITCH
                                                      : CU_EGL_FRAME_TYPE_ARRAY;
  cudaEgl.planeCount = surfNum;
  cudaEgl.numChannels = (eglColorFormat == CU_EGL_COLOR_FORMAT_ARGB) ? 4 : 1;
  cudaEgl.eglColorFormat = eglColorFormat;
  cudaEgl.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;
  cuStatus =
      cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, NULL);
  if (cuStatus != CUDA_SUCCESS) {
    printf("cuda Producer present frame FAILED with custatus= %d\n", cuStatus);
    goto done;
  }
done:
  if (file_p) {
    fclose(file_p);
    file_p = NULL;
  }
  return cuStatus;
}
/* Creates the producer's CUDA context on |device| and pre-allocates every
 * device resource used by cudaProducerTest: pitch-linear buffers and CUDA
 * arrays for one ARGB surface and one planar YUV420 surface, plus a host
 * staging buffer large enough for either format.
 *
 * The freshly created context is popped from the calling thread before
 * returning; the handle is kept in |cudaProducer->context|.  Returns
 * CUDA_SUCCESS or a CUDA error code.
 */
CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer,
                                  CUdevice device) {
  CUresult status = CUDA_SUCCESS;
  if (CUDA_SUCCESS != (status = cuInit(0))) {
    printf("Failed to initialize CUDA\n");
    return status;
  }
  int major = 0, minor = 0;
  char deviceName[256];
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
  checkCudaErrors(cuDeviceGetName(deviceName, 256, device));
  printf(
      "CUDA Producer on GPU Device %d: \"%s\" with compute capability "
      "%d.%d\n\n",
      device, deviceName, major, minor);
  if (CUDA_SUCCESS !=
      (status = cuCtxCreate(&cudaProducer->context, 0, device))) {
    printf("failed to create CUDA context\n");
    return status;
  }
  // Pitch-linear device buffers: one ARGB surface plus the three planes of
  // a YUV420 frame (full-size Y, quarter-size U and V).
  status = cuMemAlloc(&cudaProducer->cudaPtrARGB[0], (WIDTH * HEIGHT * 4));
  if (status != CUDA_SUCCESS) {
    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
    return status;
  }
  status = cuMemAlloc(&cudaProducer->cudaPtrYUV[0], (WIDTH * HEIGHT));
  if (status != CUDA_SUCCESS) {
    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
    return status;
  }
  status = cuMemAlloc(&cudaProducer->cudaPtrYUV[1], (WIDTH * HEIGHT) / 4);
  if (status != CUDA_SUCCESS) {
    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
    return status;
  }
  status = cuMemAlloc(&cudaProducer->cudaPtrYUV[2], (WIDTH * HEIGHT) / 4);
  if (status != CUDA_SUCCESS) {
    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
    return status;
  }
  // CUDA-array backed surfaces for the same two formats.
  // NOTE(review): Width is set to WIDTH * 4 with NumChannels = 4; this
  // matches the byte-width convention used by cudaProducerTest, but verify
  // against cuArray3DCreate's element-count semantics.
  CUDA_ARRAY3D_DESCRIPTOR desc = {0};
  desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
  desc.Depth = 1;
  desc.Flags = CUDA_ARRAY3D_SURFACE_LDST;
  desc.NumChannels = 4;
  desc.Width = WIDTH * 4;
  desc.Height = HEIGHT;
  status = cuArray3DCreate(&cudaProducer->cudaArrARGB[0], &desc);
  if (status != CUDA_SUCCESS) {
    printf("Create CUDA array failed, cuStatus=%d\n", status);
    return status;
  }
  for (int i = 0; i < 3; i++) {
    if (i == 0) {  // Y plane: full resolution, single channel
      desc.NumChannels = 1;
      desc.Width = WIDTH;
      desc.Height = HEIGHT;
    } else {  // U/V surface as planar
      desc.NumChannels = 1;
      desc.Width = WIDTH / 2;
      desc.Height = HEIGHT / 2;
    }
    status = cuArray3DCreate(&cudaProducer->cudaArrYUV[i], &desc);
    if (status != CUDA_SUCCESS) {
      printf("Create CUDA array failed, cuStatus=%d\n", status);
      return status;
    }
  }
  // Host staging buffer, sized for the largest frame (ARGB).
  cudaProducer->pBuff = (unsigned char *)malloc((WIDTH * HEIGHT * 4));
  if (!cudaProducer->pBuff) {
    printf("CudaProducer: Failed to allocate image buffer\n");
    // Was silently returning CUDA_SUCCESS on allocation failure.
    status = CUDA_ERROR_OUT_OF_MEMORY;
  }
  checkCudaErrors(cuCtxPopCurrent(&cudaProducer->context));
  return status;
}
// Populates the producer state from the parsed command-line arguments and
// binds it to the already-initialized EGL display and stream.
void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay,
                      EGLStreamKHR eglStream, TestArgs *args) {
  // Frame geometry and format selection.
  cudaProducer->width = args->inputWidth;
  cudaProducer->height = args->inputHeight;
  cudaProducer->isARGB = args->isARGB;
  cudaProducer->pitchLinearOutput = args->pitchLinearOutput;
  // Source files: one frame is produced from each.
  cudaProducer->fileName1 = args->infile1;
  cudaProducer->fileName2 = args->infile2;
  cudaProducer->frameCount = 2;
  // Set cudaProducer default parameters
  cudaProducer->eglDisplay = eglDisplay;
  cudaProducer->eglStream = eglStream;
}
// Releases the host staging buffer and every device allocation made by
// cudaDeviceCreateProducer, then tears down the producer's EGLStream
// connection.  Returns the status of cuEGLStreamProducerDisconnect.
CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) {
  if (cudaProducer->pBuff) {
    free(cudaProducer->pBuff);
  }
  // Pitch-linear device buffers.
  checkCudaErrors(cuMemFree(cudaProducer->cudaPtrARGB[0]));
  for (int plane = 0; plane < 3; plane++) {
    checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[plane]));
  }
  // CUDA-array backed surfaces.
  checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrARGB[0]));
  for (int plane = 0; plane < 3; plane++) {
    checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[plane]));
  }
  return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn);
}

View File

@ -0,0 +1,68 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: Simple cuda producer header file
//
#ifndef _CUDA_PRODUCER_H_
#define _CUDA_PRODUCER_H_
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include "cudaEGL.h"
#include "eglstrm_common.h"
// Globals shared with the EGL helper code (defined in eglstrm_common.cpp).
extern EGLStreamKHR eglStream;
extern EGLDisplay g_display;
// State for one CUDA EGLStream producer instance.
typedef struct _test_cuda_producer_s {
// Stream params
char *fileName1;  // first input frame file
char *fileName2;  // second input frame file
unsigned char *pBuff;  // host staging buffer for one frame
int frameCount;  // number of frames this producer presents
bool isARGB;  // true: ARGB frames, false: planar YUV420
bool pitchLinearOutput;  // true: pitch-linear device buffers, false: CUDA arrays
unsigned int width;  // frame width in pixels
unsigned int height;  // frame height in pixels
CUcontext context;  // producer-side CUDA context
CUeglStreamConnection cudaConn;  // producer connection to the EGLStream
CUdeviceptr cudaPtrARGB[1];  // pitch-linear ARGB buffer
CUdeviceptr cudaPtrYUV[3];  // pitch-linear Y/U/V plane buffers
CUarray cudaArrARGB[1];  // array-backed ARGB surface
CUarray cudaArrYUV[3];  // array-backed Y/U/V plane surfaces
EGLStreamKHR eglStream;  // stream frames are presented into
EGLDisplay eglDisplay;  // display owning the stream
} test_cuda_producer_s;
// Fills |cudaProducer| from the parsed command-line arguments and EGL objects.
void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay,
                      EGLStreamKHR eglStream, TestArgs *args);
// Reads one frame from |file| and presents it on the EGLStream.
CUresult cudaProducerTest(test_cuda_producer_s *parserArg, char *file);
// Frees all producer resources and disconnects from the EGLStream.
CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer);
// Creates the producer's CUDA context and allocations on |device|.
CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer,
                                  CUdevice device);
#endif

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,139 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: Common egl stream functions
//
#include "eglstrm_common.h"
// Global EGL state shared by the producer and consumer sides of the sample.
EGLStreamKHR eglStream;  // stream connecting producer and consumer
EGLDisplay g_display;    // display opened on the selected EGL device
EGLAttrib cudaIndex;     // CUDA device id reported via EGL_CUDA_DEVICE_NV
#if defined(EXTENSION_LIST)
// Define the my_* extension function-pointer variables (see EXTLST_DECL in
// eglstrm_common.h); they are filled in by eglSetupExtensions().
EXTENSION_LIST(EXTLST_DECL)
// Generic function-pointer signature used to store table entries.
typedef void (*extlst_fnptr_t)(void);
// Lookup table mapping each extension function name to the pointer slot
// that eglSetupExtensions() resolves through eglGetProcAddress().
static struct {
  extlst_fnptr_t *fnptr;
  char const *name;
} extensionList[] = {EXTENSION_LIST(EXTLST_ENTRY)};
/*
 * Resolves every EGL extension entry point named in extensionList through
 * eglGetProcAddress(), storing each result via the table's pointer slot.
 *
 * Returns 1 when all entry points resolve, 0 on the first lookup failure.
 */
int eglSetupExtensions(void) {
  size_t idx;
  const size_t numEntries = sizeof(extensionList) / sizeof(extensionList[0]);

  for (idx = 0; idx < numEntries; idx++) {
    *extensionList[idx].fnptr = eglGetProcAddress(extensionList[idx].name);
    if (*extensionList[idx].fnptr == NULL) {
      printf("Couldn't get address of %s()\n", extensionList[idx].name);
      return 0;
    }
  }
  return 1;
}
/*
 * Initializes EGL and creates the mailbox-mode EGL stream used by this
 * sample: enumerates EGL devices, picks the first one that reports a CUDA
 * device id (EGL_CUDA_DEVICE_NV), opens/initializes a display on it,
 * creates the stream, and sets consumer latency / acquire-timeout
 * attributes.
 *
 * On success returns 1 and stores the selected CUDA device id in
 * *cuda_device; returns 0 if setting a stream attribute fails.  Fatal
 * setup failures call exit() directly (EXIT_WAIVED when no suitable device
 * exists, EXIT_FAILURE otherwise).
 */
int EGLStreamInit(int *cuda_device) {
  // EGL_SUPPORT_REUSE_NV = EGL_FALSE selects mailbox-style streaming
  // (frames are not reused by the producer).
  static const EGLint streamAttrMailboxMode[] = {EGL_SUPPORT_REUSE_NV,
                                                 EGL_FALSE, EGL_NONE};
  EGLBoolean eglStatus;
#define MAX_EGL_DEVICES 4
  EGLint numDevices = 0;
  EGLDeviceEXT devices[MAX_EGL_DEVICES];
  eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices);
  if (eglStatus != EGL_TRUE) {
    printf("Error querying EGL devices\n");
    exit(EXIT_FAILURE);
  }
  if (numDevices == 0) {
    printf("No EGL devices found.. Waiving\n");
    eglStatus = EGL_FALSE;
    exit(EXIT_WAIVED);
  }
  int egl_device_id = 0;
  // Scan for the first EGL device that exposes a CUDA device id.
  for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) {
    eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id],
                                        EGL_CUDA_DEVICE_NV, &cudaIndex);
    if (eglStatus == EGL_TRUE) {
      *cuda_device = cudaIndex;  // We select first EGL-CUDA Capable device.
      printf("Found EGL-CUDA Capable device with CUDA Device id = %d\n",
             (int)cudaIndex);
      break;
    }
  }
  if (egl_device_id >= numDevices) {
    printf("No CUDA Capable EGL Device found.. Waiving execution\n");
    exit(EXIT_WAIVED);
  }
  // Open a display directly on the selected device (no window system).
  g_display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT,
                                       (void *)devices[egl_device_id], NULL);
  if (g_display == EGL_NO_DISPLAY) {
    printf("Could not get EGL display from device. \n");
    eglStatus = EGL_FALSE;
    exit(EXIT_FAILURE);
  }
  eglStatus = eglInitialize(g_display, 0, 0);
  if (!eglStatus) {
    printf("EGL failed to initialize. \n");
    eglStatus = EGL_FALSE;
    exit(EXIT_FAILURE);
  }
  eglStream = eglCreateStreamKHR(g_display, streamAttrMailboxMode);
  if (eglStream == EGL_NO_STREAM_KHR) {
    printf("Could not create EGL stream.\n");
    eglStatus = EGL_FALSE;
    exit(EXIT_FAILURE);
  }
  printf("Created EGLStream %p\n", eglStream);
  // Set stream attribute
  if (!eglStreamAttribKHR(g_display, eglStream, EGL_CONSUMER_LATENCY_USEC_KHR,
                          16000)) {
    printf(
        "Consumer: eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n");
    return 0;
  }
  if (!eglStreamAttribKHR(g_display, eglStream,
                          EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000)) {
    printf(
        "Consumer: eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR "
        "failed\n");
    return 0;
  }
  printf("EGLStream initialized\n");
  return 1;
}
void EGLStreamFini(void) { eglDestroyStreamKHR(g_display, eglStream); }
#endif

View File

@ -0,0 +1,103 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: Common EGL stream functions header file
//
#ifndef _EGLSTRM_COMMON_H_
#define _EGLSTRM_COMMON_H_

#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include "cuda.h"
#include "cudaEGL.h"
#include "helper_cuda_drvapi.h"

// X-macro listing every EGL extension entry point this sample loads at
// runtime.  T(type, name) pairs are expanded below to declare the function
// pointers (EXTLST_DECL), extern-declare them (EXTLST_EXTERN), or build the
// lookup table used by eglSetupExtensions() (EXTLST_ENTRY).
#define EXTENSION_LIST(T)                                      \
  T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR)             \
  T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR)           \
  T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR)               \
  T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR)         \
  T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR)       \
  T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR)             \
  T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR) \
  T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR) \
  T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC,              \
    eglStreamConsumerGLTextureExternalKHR)                     \
  T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \
  T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT)             \
  T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT) \
  T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT)   \
  T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC,               \
    eglCreateStreamFromFileDescriptorKHR)

// Redirect each EGL extension function name to the runtime-resolved my_*
// pointer, so call sites can use the standard names directly.
#define eglCreateStreamKHR my_eglCreateStreamKHR
#define eglDestroyStreamKHR my_eglDestroyStreamKHR
#define eglQueryStreamKHR my_eglQueryStreamKHR
#define eglQueryStreamu64KHR my_eglQueryStreamu64KHR
#define eglQueryStreamTimeKHR my_eglQueryStreamTimeKHR
#define eglStreamAttribKHR my_eglStreamAttribKHR
#define eglStreamConsumerAcquireKHR my_eglStreamConsumerAcquireKHR
#define eglStreamConsumerReleaseKHR my_eglStreamConsumerReleaseKHR
#define eglStreamConsumerGLTextureExternalKHR \
  my_eglStreamConsumerGLTextureExternalKHR
#define eglGetStreamFileDescriptorKHR my_eglGetStreamFileDescriptorKHR
#define eglCreateStreamFromFileDescriptorKHR \
  my_eglCreateStreamFromFileDescriptorKHR
#define eglQueryDevicesEXT my_eglQueryDevicesEXT
#define eglGetPlatformDisplayEXT my_eglGetPlatformDisplayEXT
#define eglQueryDeviceAttribEXT my_eglQueryDeviceAttribEXT

// Expansions of EXTENSION_LIST: define the pointer (initialized NULL),
// declare it extern, or emit a { &pointer, "name" } table entry.
#define EXTLST_DECL(tx, x) tx my_##x = NULL;
#define EXTLST_EXTERN(tx, x) extern tx my_##x;
#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&my_##x, #x},

#define MAX_STRING_SIZE 256
// Frame dimensions used by the sample (see main.c and EGLStreamInit).
#define WIDTH 720
#define HEIGHT 480

// Per-run configuration shared by the CUDA producer and consumer.
typedef struct _TestArgs {
  char *infile1;             // first input frame file path
  char *infile2;             // second input frame file path
  bool isARGB;               // true: ARGB frames; false: YUV frames
  unsigned int inputWidth;   // frame width in pixels
  unsigned int inputHeight;  // frame height in pixels
  bool pitchLinearOutput;    // true: pitch-linear; false: block-linear
} TestArgs;

// Resolves the extension entry points above; returns 1 on success.
int eglSetupExtensions(void);
// Creates the EGL stream; returns 1 on success and the CUDA device in *dev.
int EGLStreamInit(int *dev);
// Destroys the EGL stream.
void EGLStreamFini(void);
#endif

View File

@ -0,0 +1,156 @@
################################################################################
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# findegl.mk is used to find the necessary EGL Libraries for specific distributions
# this is supported on Linux
#
################################################################################
# Determine OS platform and unix distribution
ifeq ("$(TARGET_OS)","linux")
   # first search lsb_release
   DISTRO  = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
   ifeq ("$(DISTRO)","")
     # second search and parse /etc/issue
     DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]")
     # ensure data from /etc/issue is valid
     ifeq (,$(filter $(DISTRO),ubuntu fedora red rhel centos suse))
       DISTRO =
     endif
     ifeq ("$(DISTRO)","")
       # third, we can search in /etc/os-release or /etc/{distro}-release
       DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB")
     endif
   endif
endif

ifeq ("$(TARGET_OS)","linux")
    # $(info) >> findegl.mk -> LINUX path <<<)
    # Each set of Linux Distros have different paths for where to find their OpenGL libraries reside
    UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu      >/dev/null 2>&1; echo $$?)
    FEDORA = $(shell echo $(DISTRO) | grep -i fedora      >/dev/null 2>&1; echo $$?)
    RHEL   = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?)
    CENTOS = $(shell echo $(DISTRO) | grep -i centos      >/dev/null 2>&1; echo $$?)
    SUSE   = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?)
    # Ubuntu: library paths differ between native builds and cross builds
    # (armv7l / aarch64 sysroots).
    ifeq ("$(UBUNTU)","0")
      ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        GLPATH := /usr/arm-linux-gnueabihf/lib
        GLLINK := -L/usr/arm-linux-gnueabihf/lib
        ifneq ($(TARGET_FS),)
          GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf
          GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
      else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64)
        GLPATH := /usr/aarch64-linux-gnu/lib
        GLLINK := -L/usr/aarch64-linux-gnu/lib
        ifneq ($(TARGET_FS),)
          GLPATH += $(TARGET_FS)/usr/lib
          GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu
          GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
        endif
      else
        # Native Ubuntu: the NVIDIA driver package name determines the lib dir.
        UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1)
        ifneq ("$(UBUNTU_PKG_NAME)","")
          GLPATH    ?= /usr/lib/$(UBUNTU_PKG_NAME)
          GLLINK    ?= -L/usr/lib/$(UBUNTU_PKG_NAME)
        endif
        DFLT_PATH ?= /usr/lib
      endif
    endif
    ifeq ("$(SUSE)","0")
      GLPATH    ?= /usr/X11R6/lib64
      GLLINK    ?= -L/usr/X11R6/lib64
      DFLT_PATH ?= /usr/lib64
    endif
    ifeq ("$(FEDORA)","0")
      GLPATH    ?= /usr/lib64/nvidia
      GLLINK    ?= -L/usr/lib64/nvidia
      DFLT_PATH ?= /usr/lib64
    endif
    ifeq ("$(RHEL)","0")
      GLPATH    ?= /usr/lib64/nvidia
      GLLINK    ?= -L/usr/lib64/nvidia
      DFLT_PATH ?= /usr/lib64
    endif
    ifeq ("$(CENTOS)","0")
      GLPATH    ?= /usr/lib64/nvidia
      GLLINK    ?= -L/usr/lib64/nvidia
      DFLT_PATH ?= /usr/lib64
    endif

    # Waive the sample if the EGL library cannot be located.
    EGLLIB := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libEGL.so -print 2>/dev/null)
    ifeq ("$(EGLLIB)","")
      $(info >>> WARNING - libEGL.so not found, please install libEGL.so <<<)
      SAMPLE_ENABLED := 0
    endif

    HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
      HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include
    else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux)
      HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include
    endif

    # Waive the sample if the EGL headers cannot be located.
    EGLHEADER    := $(shell find -L $(HEADER_SEARCH_PATH) -name egl.h -print 2>/dev/null)
    EGLEXTHEADER := $(shell find -L $(HEADER_SEARCH_PATH) -name eglext.h -print 2>/dev/null)
    ifeq ("$(EGLHEADER)","")
      $(info >>> WARNING - egl.h not found, please install egl.h <<<)
      SAMPLE_ENABLED := 0
    endif
    ifeq ("$(EGLEXTHEADER)","")
      $(info >>> WARNING - eglext.h not found, please install eglext.h <<<)
      SAMPLE_ENABLED := 0
    endif
else
endif

# Attempt to compile a minimal EGL application and run to check if EGL_SUPPORT_REUSE_NV is supported in the EGL headers available.
ifneq ($(SAMPLE_ENABLED), 0)
  $(shell printf "#include <EGL/egl.h>\n#include <EGL/eglext.h>\nint main() {\n#ifdef EGL_SUPPORT_REUSE_NV \n #error \"Compatible EGL header found\" \n return 0;\n#endif \n return 1;\n}" > test.c; )
  EGL_DEFINES := $(shell $(HOST_COMPILER) $(CCFLAGS) $(EXTRA_CCFLAGS) -lEGL test.c -c 2>&1 | grep -ic "Compatible EGL header found";)
  SHOULD_WAIVE := 0
  ifeq ($(EGL_DEFINES),0)
    SHOULD_WAIVE := 1
  endif
  ifeq ($(SHOULD_WAIVE),1)
    $(info -----------------------------------------------------------------------------------------------)
    $(info WARNING - NVIDIA EGL EXTENSIONS are not available in the present EGL headers)
    $(info -----------------------------------------------------------------------------------------------)
    $(info This CUDA Sample cannot be built if the EGL NVIDIA EXTENSIONS like EGL_SUPPORT_REUSE_NV are not supported in EGL headers.)
    $(info This will be a dry-run of the Makefile.)
    $(info Please install the latest khronos EGL headers and libs to build this sample)
    $(info -----------------------------------------------------------------------------------------------)
    SAMPLE_ENABLED := 0
  endif
  $(shell rm test.o test.c 2>/dev/null)
endif

View File

@ -0,0 +1,231 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// DESCRIPTION: Simple EGL stream sample app
//
//
//#define EGL_EGLEXT_PROTOTYPES
#include "cudaEGL.h"
#include "cuda_consumer.h"
#include "cuda_producer.h"
#include "eglstrm_common.h"
/* ------ globals ---------*/
#if defined(EXTENSION_LIST)
// Extern-declare the my_* EGL extension pointers; they are defined in
// eglstrm_common.c and resolved by eglSetupExtensions().
EXTENSION_LIST(EXTLST_EXTERN)
#endif
// Number of producer/consumer iterations run by main().
// NOTE(review): likely a typo for NUM_TRIALS; renaming would touch callers.
#define NUM_TRAILS 4
// Set when a handled signal (Ctrl-C) arrives; observed by main().
bool signal_stop = 0;

// Signal handler: report the signal number and request a stop.
static void sig_handler(int sig) {
  printf("Signal: %d\n", sig);
  signal_stop = 1;
}
/*
 * Entry point for the CUDA <-> EGLStream interop sample.
 *
 * Creates one EGL stream, connects a CUDA producer and a CUDA consumer to
 * it, then runs NUM_TRAILS iterations.  Each iteration alternates between
 * ARGB and YUV input files and between pitch-linear and block-linear
 * output, pushing two frames through the stream and checking each on the
 * consumer side.  Prints PASSED/FAILED on stdout; always returns 0.
 */
int main(int argc, char **argv) {
  TestArgs args;
  CUresult curesult = CUDA_SUCCESS;
  unsigned int i, j;
  EGLint streamState = 0;
  test_cuda_consumer_s cudaConsumer;
  test_cuda_producer_s cudaProducer;
  memset(&cudaProducer, 0, sizeof(test_cuda_producer_s));
  memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s));
  // Hook up Ctrl-C handler
  signal(SIGINT, sig_handler);
  if (!eglSetupExtensions()) {
    printf("SetupExtentions failed \n");
    curesult = CUDA_ERROR_UNKNOWN;
    // NOTE(review): jumping to done before EGLStreamInit succeeds reaches
    // eglQueryStreamKHR with an uninitialized eglStream/g_display —
    // confirm this is benign.
    goto done;
  }
  checkCudaErrors(cuInit(0));
  int count;
  checkCudaErrors(cuDeviceGetCount(&count));
  printf("Found %d cuda devices\n", count);
  CUdevice devId;
  if (!EGLStreamInit(&devId)) {
    printf("EGLStream Init failed.\n");
    curesult = CUDA_ERROR_UNKNOWN;
    goto done;
  }
  // Create producer and consumer CUDA contexts on the EGL-selected device.
  curesult = cudaDeviceCreateProducer(&cudaProducer, devId);
  if (curesult != CUDA_SUCCESS) {
    goto done;
  }
  curesult = cudaDeviceCreateConsumer(&cudaConsumer, devId);
  if (curesult != CUDA_SUCCESS) {
    goto done;
  }
  // Connect the consumer end of the stream under the consumer's context.
  checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
  if (CUDA_SUCCESS != (curesult = cuEGLStreamConsumerConnect(
                           &(cudaConsumer.cudaConn), eglStream))) {
    printf("FAILED Connect CUDA consumer with error %d\n", curesult);
    goto done;
  } else {
    printf("Connected CUDA consumer, CudaConsumer %p\n", cudaConsumer.cudaConn);
  }
  checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
  // Connect the producer end (with the frame size) under its own context.
  checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
  if (CUDA_SUCCESS ==
      (curesult = cuEGLStreamProducerConnect(&(cudaProducer.cudaConn),
                                             eglStream, WIDTH, HEIGHT))) {
    printf("Connect CUDA producer Done, CudaProducer %p\n",
           cudaProducer.cudaConn);
  } else {
    printf("Connect CUDA producer FAILED with error %d\n", curesult);
    goto done;
  }
  checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
  // Initialize producer
  for (i = 0; i < NUM_TRAILS; i++) {
    if (streamState != EGL_STREAM_STATE_CONNECTING_KHR) {
      if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
                             &streamState)) {
        printf("main: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
        curesult = CUDA_ERROR_UNKNOWN;
        goto done;
      }
    }
    args.inputWidth = WIDTH;
    args.inputHeight = HEIGHT;
    // Odd iterations use the ARGB input files, even iterations the YUV ones.
    if (i % 2 != 0) {
      args.isARGB = 1;
      args.infile1 = sdkFindFilePath("cuda_f_1.yuv", argv[0]);
      args.infile2 = sdkFindFilePath("cuda_f_2.yuv", argv[0]);
    } else {
      args.isARGB = 0;
      args.infile1 = sdkFindFilePath("cuda_yuv_f_1.yuv", argv[0]);
      args.infile2 = sdkFindFilePath("cuda_yuv_f_2.yuv", argv[0]);
    }
    // First two of every four iterations use pitch-linear output.
    if ((i % 4) < 2) {
      args.pitchLinearOutput = 1;
    } else {
      args.pitchLinearOutput = 0;
    }
    checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
    cudaProducerInit(&cudaProducer, g_display, eglStream, &args);
    checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
    checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
    cuda_consumer_init(&cudaConsumer, &args);
    checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
    printf("main - Cuda Producer and Consumer Initialized.\n");
    // Push two frames: j==0 uses the first file/output pair, j==1 the second.
    for (j = 0; j < 2; j++) {
      printf("Running for %s frame and %s input\n",
             args.isARGB ? "ARGB" : "YUV",
             args.pitchLinearOutput ? "Pitchlinear" : "BlockLinear");
      if (j == 0) {
        checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
        curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName1);
        if (curesult != CUDA_SUCCESS) {
          printf("Cuda Producer Test failed for frame = %d\n", j + 1);
          goto done;
        }
        checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
        checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
        curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile1);
        if (curesult != CUDA_SUCCESS) {
          printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
          goto done;
        }
        checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
      } else {
        checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
        curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName2);
        if (curesult != CUDA_SUCCESS) {
          printf("Cuda Producer Test failed for frame = %d\n", j + 1);
          goto done;
        }
        checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
        checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
        curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile2);
        if (curesult != CUDA_SUCCESS) {
          printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
          goto done;
        }
        checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
      }
    }
  }
  // Disconnect producer, then consumer (only if the stream is still live).
  checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
  if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) {
    printf("Producer Disconnect FAILED. \n");
    goto done;
  }
  checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
  if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
                         &streamState)) {
    printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
    curesult = CUDA_ERROR_UNKNOWN;
    goto done;
  }
  if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) {
    if (CUDA_SUCCESS != (curesult = cuda_consumer_deinit(&cudaConsumer))) {
      printf("Consumer Disconnect FAILED.\n");
      goto done;
    }
  }
  printf("Producer and Consumer Disconnected \n");
done:
  // Common exit path: tear down the stream if it is still connected, then
  // report the overall result.
  if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
                         &streamState)) {
    printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
    curesult = CUDA_ERROR_UNKNOWN;
  }
  if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) {
    EGLStreamFini();
  }
  if (curesult == CUDA_SUCCESS) {
    printf("&&&& EGLStream interop test PASSED\n");
  } else {
    printf("&&&& EGLStream interop test FAILED\n");
  }
  return 0;
}

View File

@ -0,0 +1,322 @@
################################################################################
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture: derive TARGET_SIZE (pointer width) from TARGET_ARCH, and
# reject unsupported cross-compilation combinations.
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler: pick a default per target OS/arch; cross toolchains for
# armv7l/aarch64/ppc64le, clang++ on recent macOS, g++ otherwise.
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

# Cross builds: point the compiler/linker at the target filesystem sysroot
# when TARGET_FS is provided.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
endif

ifeq ($(TARGET_OS),qnx)
    CCFLAGS += -DWIN_INTERFACE_CUSTOM
    LDFLAGS += -lsocket
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

SAMPLE_ENABLED := 1

# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
  $(info >>> WARNING - NV12toBGRandResize is not supported on ARMv7 - waiving sample <<<)
  SAMPLE_ENABLED := 0
endif

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I../../Common
LIBRARIES :=

################################################################################

# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
SMS ?= 30 35 37 50 52 60 61 70 72 75
else
SMS ?= 30 35 37 50 52 60 61 70 75
endif

ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

# When waived, EXEC echoes each build command instead of running it
# (dry-run mode).
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: NV12toBGRandResize

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

bgr_resize.o:bgr_resize.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

nv12_resize.o:nv12_resize.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

nv12_to_bgr_planar.o:nv12_to_bgr_planar.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

resize_convert_main.o:resize_convert_main.cpp
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

utils.o:utils.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

NV12toBGRandResize: bgr_resize.o nv12_resize.o nv12_to_bgr_planar.o resize_convert_main.o utils.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	$(EXEC) ./NV12toBGRandResize

clean:
	rm -f NV12toBGRandResize bgr_resize.o nv12_resize.o nv12_to_bgr_planar.o resize_convert_main.o utils.o
	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/NV12toBGRandResize

clobber: clean

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2012
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,112 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>NV12toBGRandResize_vs2012</RootNamespace>
<ProjectName>NV12toBGRandResize</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bgr_resize.cu" />
<CudaCompile Include="nv12_resize.cu" />
<CudaCompile Include="nv12_to_bgr_planar.cu" />
<ClCompile Include="resize_convert_main.cpp" />
<CudaCompile Include="utils.cu" />
<ClInclude Include="resize_convert.h" />
<ClInclude Include="utils.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 13.00
# Visual Studio 2013
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,112 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>NV12toBGRandResize_vs2013</RootNamespace>
<ProjectName>NV12toBGRandResize</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bgr_resize.cu" />
<CudaCompile Include="nv12_resize.cu" />
<CudaCompile Include="nv12_to_bgr_planar.cu" />
<ClCompile Include="resize_convert_main.cpp" />
<CudaCompile Include="utils.cu" />
<ClInclude Include="resize_convert.h" />
<ClInclude Include="utils.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 14.00
# Visual Studio 2015
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,112 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>NV12toBGRandResize_vs2015</RootNamespace>
<ProjectName>NV12toBGRandResize</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bgr_resize.cu" />
<CudaCompile Include="nv12_resize.cu" />
<CudaCompile Include="nv12_to_bgr_planar.cu" />
<ClCompile Include="resize_convert_main.cpp" />
<CudaCompile Include="utils.cu" />
<ClInclude Include="resize_convert.h" />
<ClInclude Include="utils.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,117 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>NV12toBGRandResize_vs2017</RootNamespace>
<ProjectName>NV12toBGRandResize</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bgr_resize.cu" />
<CudaCompile Include="nv12_resize.cu" />
<CudaCompile Include="nv12_to_bgr_planar.cu" />
<ClCompile Include="resize_convert_main.cpp" />
<CudaCompile Include="utils.cu" />
<ClInclude Include="resize_convert.h" />
<ClInclude Include="utils.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>NV12toBGRandResize_vs2019</RootNamespace>
<ProjectName>NV12toBGRandResize</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bgr_resize.cu" />
<CudaCompile Include="nv12_resize.cu" />
<CudaCompile Include="nv12_to_bgr_planar.cu" />
<ClCompile Include="resize_convert_main.cpp" />
<CudaCompile Include="utils.cu" />
<ClInclude Include="resize_convert.h" />
<ClInclude Include="utils.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>NV12toBGRandResize</name>
<cuda_api_list>
<toolkit>cudaMemcpy2D</toolkit>
<toolkit>cudaMallocManaged</toolkit>
</cuda_api_list>
<description><![CDATA[This code shows two ways to convert and resize NV12 frames to BGR 3-planar frames using CUDA in batch. Way-1: convert the NV12 input to BGR at input Resolution#1, then resize to Resolution#2. Way-2: resize the NV12 input to Resolution#2, then convert it to BGR output. The NVIDIA HW decoder, on both dGPU and Tegra, normally outputs NV12 pitch-format frames. For inference using TensorRT, the input frame needs to be in BGR planar format, possibly with a different size. So, conversion and resizing from NV12 to BGR planar are usually required for the inference following decoding. This CUDA code provides a reference implementation for both conversion and resizing.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../common/inc</path>
</includepaths>
<keyconcepts>
<concept level="basic">Graphics Interop</concept>
<concept level="basic">Image Processing</concept>
<concept level="basic">Video Processing</concept>
</keyconcepts>
<keywords>
<keyword>GPGPU</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>resize_convert_main.cpp</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>2:Image Processing</scope>
<scope>2:Computer Vision</scope>
</scopes>
<sm-arch>sm30</sm-arch>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>aarch64</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>NV12toBGRandResize</title>
<type>exe</type>
</entry>

View File

@ -0,0 +1,94 @@
# NV12toBGRandResize - NV12toBGRandResize
## Description
This code shows two ways to convert and resize NV12 frames to BGR 3 planars frames using CUDA in batch. Way-1, Convert NV12 Input to BGR @ Input Resolution-1, then Resize to Resolution#2. Way-2, resize NV12 Input to Resolution#2 then convert it to BGR Output. NVIDIA HW Decoder, both dGPU and Tegra, normally outputs NV12 pitch format frames. For the inference using TensorRT, the input frame needs to be BGR planar format with possibly different size. So, conversion and resizing from NV12 to BGR planar is usually required for the inference following decoding. This CUDA code provides a reference implementation for conversion and resizing.
## Key Concepts
Graphics Interop, Image Processing, Video Processing
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows, MacOSX
## Supported CPU Architecture
x86_64, ppc64le, aarch64
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMemcpy2D, cudaMallocManaged
## Prerequisites
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
### Mac
The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
```
$ make SMS="A B ..."
```
* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=clang
```
## References (for more details)

View File

@ -0,0 +1,134 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Implements BGR 3 progressive planars frames batch resize
#include <cuda.h>
#include <cuda_runtime.h>
#include "resize_convert.h"
// Kernel: resize a batch of BGR planar float frames by sampling the source
// texture (set up by resizeBGRplanarBatchCore with linear filtering) into
// the destination buffer.
// Source layout: the texture stacks batch * 3 planes vertically, each plane
// nSrcHeight rows tall (see the row offset below). One thread writes one
// output pixel in each of the three channels; grid.z strides over the batch
// so a capped Z dimension can still cover a large batch.
// nDstPitch is in float elements, not bytes.
__global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,
float *pDst, int nDstPitch, int nDstHeight, int nSrcHeight,
int batch, float scaleX, float scaleY,
int cropX, int cropY, int cropW, int cropH) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
// Guard: the output size is (cropW / scaleX) x (cropH / scaleY).
if (x >= (int)(cropW/scaleX) || y >= (int)(cropH/scaleY))
return;
// Size of one destination plane in float elements.
int frameSize = nDstPitch*nDstHeight;
float *p = NULL;
for (int i = blockIdx.z; i < batch; i += gridDim.z) {
#pragma unroll
for (int channel=0; channel < 3; channel++){
// Destination: frame i, plane 'channel', row y, column x.
p = pDst + i * 3 * frameSize + y * nDstPitch + x + channel * frameSize;
// Sample the matching source plane: planes are stacked vertically in
// the texture, so the row is offset by (3*i + channel) * nSrcHeight.
*p = tex2D<float>(texSrc, x * scaleX + cropX,
((3 * i + channel) * nSrcHeight + y * scaleY + cropY));
}
}
}
// Core implementation of the batched BGR planar resize.
//
// All planes of all frames are exposed to the kernel as one tall pitch2D
// texture (height = batch * 3 * nSrcHeight). Because a pitch2D texture's
// height is capped at 65536 rows here, the batch is split into up to two
// vertical tiles, each with its own texture object.
// nSrcPitch/nDstPitch are in float elements (converted to bytes only for
// the texture descriptor). A zero cropW/cropH selects the full frame.
static void resizeBGRplanarBatchCore(
float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
int nBatchSize, cudaStream_t stream, bool whSameResizeRatio,
int cropX, int cropY, int cropW, int cropH) {
cudaTextureObject_t texSrc[2];
int nTiles = 1, h, iTile;
h = nSrcHeight * 3 * nBatchSize;
// Grow the tile count until each tile's texture height fits in 65536 rows.
while ((h + nTiles - 1) / nTiles > 65536)
nTiles++;
// NOTE(review): silently does nothing when more than two tiles would be
// required (very large batch * height); consider reporting an error.
if (nTiles > 2)
return;
// All tiles but the last hold batchTile frames; the last takes the rest.
int batchTile = nBatchSize / nTiles;
int batchTileLast = nBatchSize - batchTile * (nTiles-1);
for (iTile = 0; iTile < nTiles; ++iTile) {
int bs = (iTile == nTiles - 1) ? batchTileLast : batchTile;
float *dpSrcNew = dpSrc +
iTile * (batchTile * 3 * nSrcHeight * nSrcPitch);
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = dpSrcNew;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
resDesc.res.pitch2D.width = nSrcWidth;
resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;
resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);
cudaTextureDesc texDesc = {};
texDesc.filterMode = cudaFilterModeLinear;  // bilinear interpolation
texDesc.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));
float *dpDstNew = dpDst +
iTile * (batchTile * 3 * nDstHeight * nDstPitch);
// A zero crop rectangle means "use the whole source frame".
if(cropW == 0 || cropH == 0) {
cropX = 0;
cropY = 0;
cropW = nSrcWidth;
cropH = nSrcHeight;
}
float scaleX = (cropW*1.0f / nDstWidth);
float scaleY = (cropH*1.0f / nDstHeight);
// Preserve aspect ratio by applying the larger scale to both axes.
if(whSameResizeRatio == true)
scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
dim3 block(32, 32, 1);
size_t blockDimZ = bs;
// Restricting blocks in Z-dim till 32 to not launch too many blocks
blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;
dim3 grid((cropW*1.0f/scaleX + block.x - 1) / block.x,
(cropH*1.0f/scaleY + block.y - 1) / block.y, blockDimZ);
resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>
(texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight,
bs, scaleX, scaleY, cropX, cropY, cropW, cropH);
}
// NOTE(review): texture objects are destroyed without synchronizing the
// stream, i.e. possibly while the kernels are still running — confirm
// this is intended/safe for the targeted CUDA versions.
for (iTile = 0; iTile < nTiles; ++iTile)
checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));
}
// Public entry point for the batched BGR planar resize. All work is done by
// the tiled core implementation; this wrapper only reorders the optional
// crop/aspect arguments into the core's parameter order.
void resizeBGRplanarBatch(
    float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
    float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
    int nBatchSize, cudaStream_t stream,
    int cropX, int cropY, int cropW, int cropH, bool whSameResizeRatio) {
  resizeBGRplanarBatchCore(dpSrc, nSrcPitch, nSrcWidth, nSrcHeight, dpDst,
                           nDstPitch, nDstWidth, nDstHeight, nBatchSize,
                           stream, whSameResizeRatio, cropX, cropY, cropW,
                           cropH);
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,112 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Implements interlace NV12 frames batch resize
#include <cuda.h>
#include <cuda_runtime.h>
#include "resize_convert.h"
/*
 * Kernel: nearest-neighbor resize of a batch of NV12 frames.
 *
 * The source frames are stacked vertically in two texture views of the same
 * buffer: texSrcLuma (one byte per texel) and texSrcChroma (uchar2 U/V
 * pairs, half width). Each thread produces a 2x2 luma quad and the matching
 * U/V pair; grid.z strides over the batch.
 *
 * Fixes relative to the previous version:
 *  - the destination pointer advanced by one frame per loop iteration
 *    regardless of blockIdx.z, so different z-blocks collided on the same
 *    output frames; it is now derived from the frame index i.
 *  - luma source rows were precomputed before the loop without a frame
 *    offset, so luma was always sampled from frame 0. Both bugs were masked
 *    because the sample replicates one identical input frame.
 */
__global__ static void resizeNV12BatchKernel(cudaTextureObject_t texSrcLuma,
                                             cudaTextureObject_t texSrcChroma,
                                             uint8_t *pDstNv12, int nSrcWidth,
                                             int nSrcHeight, int nDstPitch,
                                             int nDstWidth, int nDstHeight,
                                             int nBatchSize) {
  int x = threadIdx.x + blockIdx.x * blockDim.x;
  int y = threadIdx.y + blockIdx.y * blockDim.y;
  int px = x * 2, py = y * 2;
  if ((px + 1) >= nDstWidth || (py + 1) >= nDstHeight) return;

  float fxScale = 1.0f * nSrcWidth / nDstWidth;
  float fyScale = 1.0f * nSrcHeight / nDstHeight;

  int hh = nDstHeight * 3 / 2;  // destination rows per NV12 frame
  int nByte = nDstPitch * hh;   // destination bytes per NV12 frame

  int px_fxScale = px * fxScale;
  int px_fxScale_1 = (px + 1) * fxScale;

  for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) {
    // Destination: frame i, luma rows py and py+1, columns px..px+1.
    uint8_t *p = pDstNv12 + (size_t)i * nByte + px + py * nDstPitch;
    // Source frames are stacked vertically, so (i * hh + row) * fyScale maps
    // a stacked destination row onto the matching stacked source row
    // (hh * fyScale == nSrcHeight * 3 / 2, one source frame; assumes even
    // heights, see the sample's stated limitation).
    int py_fyScale = (i * hh + py) * fyScale;
    int py_fyScale_1 = (i * hh + py + 1) * fyScale;
    *(uchar2 *)p =
        make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale),
                    tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale));
    *(uchar2 *)(p + nDstPitch) =
        make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale_1),
                    tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale_1));
    // Chroma: p already contains py (= 2*y) rows, so adding (nDstHeight - y)
    // rows lands on chroma row y of frame i's chroma plane.
    *(uchar2 *)(p + (nDstHeight - y) * nDstPitch) = tex2D<uchar2>(
        texSrcChroma, x * fxScale, (hh * i + nDstHeight + y) * fyScale);
  }
}
/*
 * Resize a batch of NV12 frames (device memory) with nearest-neighbor
 * sampling. The frames are stacked vertically in dpSrc; nSrcPitch and
 * nDstPitch are in bytes. Launches asynchronously on 'stream'.
 */
void resizeNV12Batch(uint8_t *dpSrc, int nSrcPitch, int nSrcWidth,
                     int nSrcHeight, uint8_t *dpDst, int nDstPitch,
                     int nDstWidth, int nDstHeight, int nBatchSize,
                     cudaStream_t stream) {
  // Rows per NV12 frame: nSrcHeight luma rows plus nSrcHeight/2 chroma rows.
  int hhSrc = ceilf(nSrcHeight * 3.0f / 2.0f);

  // Luma view: one byte per texel, all frames stacked vertically.
  cudaResourceDesc resDesc = {};
  resDesc.resType = cudaResourceTypePitch2D;
  resDesc.res.pitch2D.devPtr = dpSrc;
  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uint8_t>();
  resDesc.res.pitch2D.width = nSrcWidth;
  resDesc.res.pitch2D.height = hhSrc * nBatchSize;
  resDesc.res.pitch2D.pitchInBytes = nSrcPitch;

  cudaTextureDesc texDesc = {};
  texDesc.filterMode = cudaFilterModePoint;  // nearest-neighbor sampling
  texDesc.readMode = cudaReadModeElementType;

  cudaTextureObject_t texLuma = 0;
  checkCudaErrors(cudaCreateTextureObject(&texLuma, &resDesc, &texDesc, NULL));

  // Chroma view: the same buffer seen as interleaved U/V pairs (uchar2),
  // so the width in texels is halved.
  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar2>();
  resDesc.res.pitch2D.width /= 2;

  cudaTextureObject_t texChroma = 0;
  checkCudaErrors(cudaCreateTextureObject(&texChroma, &resDesc, &texDesc, NULL));

  dim3 block(32, 32, 1);
  size_t blockDimZ = nBatchSize;
  // Restricting blocks in Z-dim till 32 to not launch too many blocks
  blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;
  // Each thread produces a 2x2 output quad, hence the /2. Exact ceiling
  // division: the previous (n + block)/block form launched one redundant
  // (fully guarded) block whenever n was a multiple of the block size.
  dim3 grid((nDstWidth / 2 + block.x - 1) / block.x,
            (nDstHeight / 2 + block.y - 1) / block.y, blockDimZ);
  resizeNV12BatchKernel<<<grid, block, 0, stream>>>(
      texLuma, texChroma, dpDst, nSrcWidth, nSrcHeight, nDstPitch, nDstWidth,
      nDstHeight, nBatchSize);
  // Kernel launches are asynchronous and do not return errors directly;
  // surface launch-configuration failures here.
  checkCudaErrors(cudaGetLastError());

  checkCudaErrors(cudaDestroyTextureObject(texLuma));
  checkCudaErrors(cudaDestroyTextureObject(texChroma));
}

View File

@ -0,0 +1,154 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Implements NV12 to BGR batch conversion
#include <cuda.h>
#include <cuda_runtime.h>
#include "resize_convert.h"
#define CONV_THREADS_X 64
#define CONV_THREADS_Y 10
// Clamp x into the closed interval [lower, upper].
__forceinline__ __device__ static float clampF(float x, float lower,
                                               float upper) {
  if (x < lower) return lower;
  if (x > upper) return upper;
  return x;
}
// Kernel: convert a batch of NV12 frames to BGR planar float frames.
//
// Each thread handles a 4x2 pixel tile: it loads 2x4 luma bytes and 2 U/V
// pairs as 32-bit words, converts them, and stores four float4 vectors per
// output plane. nNv12Pitch is in bytes; nRgbPitch is in BYTES of the float
// output (hence the >> 2 conversions to float elements below).
// The coefficients (1.1644, 2.0172, -0.3918/-0.8130, 1.5960) appear to be
// BT.601 limited-range YUV->RGB factors — TODO confirm.
// NOTE(review): the luma terms omit the usual -16 offset
// (1.1644 * (Y - 16)), which biases the output upward; confirm whether this
// approximation is intentional.
__global__ static void nv12ToBGRplanarBatchKernel(const uint8_t *pNv12,
int nNv12Pitch, float *pBgr,
int nRgbPitch, int nWidth,
int nHeight, int nBatchSize) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
// Guard: thread (x, y) covers pixels [4x, 4x+3] x [2y, 2y+1].
if ((x << 2) + 1 > nWidth || (y << 1) + 1 > nHeight) return;
const uint8_t *__restrict__ pSrc = pNv12;
for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) {
// Source: frame i (1.5 * nHeight rows per NV12 frame), luma row 2y,
// column 4x.
pSrc = pNv12 + i * ((nHeight * nNv12Pitch * 3) >> 1) + (x << 2) +
(y << 1) * nNv12Pitch;
uchar4 luma2x01, luma2x23, uv2;
// Vectorized loads: two rows of 4 luma bytes, plus 2 interleaved U/V
// pairs. pSrc already contains 2y rows, so adding (nHeight - y) rows
// lands on chroma row y (chroma plane starts at row nHeight).
*(uint32_t *)&luma2x01 = *(uint32_t *)pSrc;
*(uint32_t *)&luma2x23 = *(uint32_t *)(pSrc + nNv12Pitch);
*(uint32_t *)&uv2 = *(uint32_t *)(pSrc + (nHeight - y) * nNv12Pitch);
// Destination tile origin for this thread block, in float elements
// (nRgbPitch >> 2 converts bytes to floats).
float *pDstBlock = (pBgr + i * ((nHeight * nRgbPitch * 3) >> 2) +
((blockIdx.x * blockDim.x) << 2) +
((blockIdx.y * blockDim.y) << 1) * (nRgbPitch >> 2));
float2 add1;
float2 add2;
float2 add3;
float2 add00, add01, add02, add03;
float2 d, e;
// Scaled luma for the 2x4 tile; .x/.y hold the two chroma-sharing
// pixel pairs of each row.
add00.x = 1.1644f * luma2x01.x;
add01.x = 1.1644f * luma2x01.y;
add00.y = 1.1644f * luma2x01.z;
add01.y = 1.1644f * luma2x01.w;
add02.x = 1.1644f * luma2x23.x;
add03.x = 1.1644f * luma2x23.y;
add02.y = 1.1644f * luma2x23.z;
add03.y = 1.1644f * luma2x23.w;
// Center chroma around zero: d = U - 128, e = V - 128.
d.x = uv2.x - 128.0f;
e.x = uv2.y - 128.0f;
d.y = uv2.z - 128.0f;
e.y = uv2.w - 128.0f;
// Chroma contributions: add1 -> blue, add2 -> green, add3 -> red.
add1.x = 2.0172f * d.x;
add1.y = 2.0172f * d.y;
add2.x = (-0.3918f) * d.x + (-0.8130f) * e.x;
add2.y = (-0.3918f) * d.y + (-0.8130f) * e.y;
add3.x = 1.5960f * e.x;
add3.y = 1.5960f * e.y;
// Row offsets (in float elements) of the tile's two output rows.
int rowStride = (threadIdx.y << 1) * (nRgbPitch >> 2);
int nextRowStride = ((threadIdx.y << 1) + 1) * (nRgbPitch >> 2);
// B
*((float4 *)&pDstBlock[rowStride + (threadIdx.x << 2)]) =
make_float4(clampF(add00.x + add1.x, 0.0f, 255.0f),
clampF(add01.x + add1.x, 0.0f, 255.0f),
clampF(add00.y + add1.y, 0.0f, 255.0f),
clampF(add01.y + add1.y, 0.0f, 255.0f));
*((float4 *)&pDstBlock[nextRowStride + (threadIdx.x << 2)]) =
make_float4(clampF(add02.x + add1.x, 0.0f, 255.0f),
clampF(add03.x + add1.x, 0.0f, 255.0f),
clampF(add02.y + add1.y, 0.0f, 255.0f),
clampF(add03.y + add1.y, 0.0f, 255.0f));
// One full output plane, in float elements.
int planeStride = nHeight * nRgbPitch >> 2;
// G
*((float4 *)&pDstBlock[planeStride + rowStride + (threadIdx.x << 2)]) =
make_float4(clampF(add00.x + add2.x, 0.0f, 255.0f),
clampF(add01.x + add2.x, 0.0f, 255.0f),
clampF(add00.y + add2.y, 0.0f, 255.0f),
clampF(add01.y + add2.y, 0.0f, 255.0f));
*((float4 *)&pDstBlock[planeStride + nextRowStride + (threadIdx.x << 2)]) =
make_float4(clampF(add02.x + add2.x, 0.0f, 255.0f),
clampF(add03.x + add2.x, 0.0f, 255.0f),
clampF(add02.y + add2.y, 0.0f, 255.0f),
clampF(add03.y + add2.y, 0.0f, 255.0f));
// R
*((float4
*)&pDstBlock[(planeStride << 1) + rowStride + (threadIdx.x << 2)]) =
make_float4(clampF(add00.x + add3.x, 0.0f, 255.0f),
clampF(add01.x + add3.x, 0.0f, 255.0f),
clampF(add00.y + add3.y, 0.0f, 255.0f),
clampF(add01.y + add3.y, 0.0f, 255.0f));
*((float4 *)&pDstBlock[(planeStride << 1) + nextRowStride +
(threadIdx.x << 2)]) =
make_float4(clampF(add02.x + add3.x, 0.0f, 255.0f),
clampF(add03.x + add3.x, 0.0f, 255.0f),
clampF(add02.y + add3.y, 0.0f, 255.0f),
clampF(add03.y + add3.y, 0.0f, 255.0f));
}
}
/*
 * Convert a batch of NV12 frames (device memory) to BGR planar float frames.
 * pNv12/nNv12Pitch: input frames, pitch in bytes.
 * pBgr/nRgbPitch:   output planar frames, pitch in BYTES of the float data.
 * Launches asynchronously on 'stream'.
 * Assumes nWidth is a multiple of 4 and nHeight a multiple of 2 (each
 * kernel thread handles a 4x2 tile with vectorized loads) — TODO confirm.
 */
void nv12ToBGRplanarBatch(uint8_t *pNv12, int nNv12Pitch, float *pBgr,
                          int nRgbPitch, int nWidth, int nHeight,
                          int nBatchSize, cudaStream_t stream) {
  dim3 threads(CONV_THREADS_X, CONV_THREADS_Y);
  size_t blockDimZ = nBatchSize;
  // Restricting blocks in Z-dim till 32 to not launch too many blocks
  blockDimZ = (blockDimZ > 32) ? 32 : blockDimZ;
  // Each thread covers 4 pixels horizontally and 2 vertically.
  dim3 blocks((nWidth / 4 - 1) / threads.x + 1,
              (nHeight / 2 - 1) / threads.y + 1, blockDimZ);
  nv12ToBGRplanarBatchKernel<<<blocks, threads, 0, stream>>>(
      pNv12, nNv12Pitch, pBgr, nRgbPitch, nWidth, nHeight, nBatchSize);
  // Kernel launches are asynchronous and do not return errors directly;
  // surface launch-configuration failures here.
  checkCudaErrors(cudaGetLastError());
}

View File

@ -0,0 +1,56 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Identifiers starting with a double underscore are reserved for the
// implementation; use a conventional, non-reserved include guard.
#ifndef RESIZE_CONVERT_H_
#define RESIZE_CONVERT_H_
#include <cstdint>  // uint8_t used in the declarations below
#include <iostream>
#include <helper_cuda.h>
// Resize a batch of NV12 frames stacked in device memory (nearest-neighbor).
// nSrcPitch/nDstPitch are in bytes. Asynchronous on 'stream'.
extern "C"
void resizeNV12Batch(
uint8_t *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
uint8_t *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
int nBatchSize, cudaStream_t stream = 0);
// Resize a batch of BGR planar float frames, optionally cropping a source
// rectangle first (cropW/cropH == 0 selects the full frame) and optionally
// forcing the same scale on both axes. Pitches are in float elements.
extern "C"
void resizeBGRplanarBatch(
float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
int nBatchSize, cudaStream_t stream = 0,
int cropX = 0, int cropY = 0, int cropW = 0, int cropH = 0,
bool whSameResizeRatio = false);
// Convert a batch of NV12 frames to BGR planar float frames.
// nNv12Pitch and nRgbPitch are in bytes. Asynchronous on 'stream'.
extern "C"
void nv12ToBGRplanarBatch(uint8_t *pNv12, int nNv12Pitch,
float *pRgb, int nRgbPitch, int nWidth, int nHeight,
int nBatchSize, cudaStream_t stream=0);
#endif  // RESIZE_CONVERT_H_

View File

@ -0,0 +1,448 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
NVIDIA HW Decoder, both dGPU and Tegra, normally outputs NV12 pitch format
frames. For the inference using TensorRT, the input frame needs to be BGR planar
format with possibly different size. So, conversion and resizing from NV12 to
BGR planar is usually required for the inference following decoding.
This CUDA code is to provide a reference implementation for conversion and
resizing.
Limitation
==========
NV12resize needs the height to be an even value.
Note
====
Resize function needs the pitch of the image buffer to be 32-byte aligned.
Run
====
./NV12toBGRandResize
OR
./NV12toBGRandResize -input=data/test1920x1080.nv12 -width=1920 -height=1080 \
-dst_width=640 -dst_height=480 -batch=40 -device=0
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#include <fstream>
#include <iostream>
#include <memory>
#include "resize_convert.h"
#include "utils.h"
#define TEST_LOOP 20
// Parameters of the NV12 -> BGR demo, filled by parseCmdLine() and consumed
// by the load/convert/resize routines.
typedef struct _nv12_to_bgr24_context_t {
int width;       // input NV12 image width
int height;      // input NV12 image height
int pitch;       // input NV12 image pitch in bytes (defaults to width)
int dst_width;   // output BGR image width
int dst_height;  // output BGR image height
int dst_pitch;   // output BGR image pitch (defaults to dst_width)
int batch;       // number of frames processed per batch
int device; // cuda device ID
char *input_nv12_file;  // path to the raw NV12 input file
int ctx_pitch; // the value will be suitable for texture memory.
int ctx_heights; // the value will be even.
} nv12_to_bgr24_context;
// Global context shared by all routines in this program.
nv12_to_bgr24_context g_ctx;
// Print command-line usage for this demo to stdout. The literal spacing in
// the option strings is part of the user-visible column layout.
static void printHelp(const char *app_name) {
std::cout << "Usage:" << app_name << " [options]\n\n";
std::cout << "OPTIONS:\n";
std::cout << "\t-h,--help\n\n";
std::cout << "\t-input=nv12file nv12 input file\n";
std::cout
<< "\t-width=width input nv12 image width, <1 -- 4096>\n";
std::cout
<< "\t-height=height input nv12 image height, <1 -- 4096>\n";
std::cout
<< "\t-pitch=pitch(optional) input nv12 image pitch, <0 -- 4096>\n";
std::cout
<< "\t-dst_width=width output BGR image width, <1 -- 4096>\n";
std::cout
<< "\t-dst_height=height output BGR image height, <1 -- 4096>\n";
std::cout
<< "\t-dst_pitch=pitch(optional) output BGR image pitch, <0 -- 4096>\n";
std::cout
<< "\t-batch=batch process frames count, <1 -- 4096>\n\n";
std::cout
<< "\t-device=device_num(optional) cuda device number, <0 -- 4096>\n\n";
return;
}
/*
 * Parse command-line options into the global g_ctx.
 * With no arguments, defaults are used and the bundled test file is located.
 * Returns 0 on success, -1 when help was requested or arguments are invalid.
 */
int parseCmdLine(int argc, char *argv[]) {
  memset(&g_ctx, 0, sizeof(g_ctx));

  // Fix: the previous code compared argv[0] (the program name) against
  // "-h"/"--help", which can never match, so help was unreachable. Scan the
  // real arguments with the samples' flag helper instead ("-h"/"--help").
  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "h")) {
    printHelp(argv[0]);
    return -1;
  }

  if (argc == 1) {
    // Run using default arguments
    g_ctx.input_nv12_file = sdkFindFilePath("test1920x1080.nv12", argv[0]);
    if (g_ctx.input_nv12_file == NULL) {
      printf("Cannot find input file test1920x1080.nv12\n Exiting\n");
      return -1;  // was EXIT_FAILURE; use -1 consistently for errors
    }
    g_ctx.width = 1920;
    g_ctx.height = 1080;
    g_ctx.dst_width = 640;
    g_ctx.dst_height = 480;
    g_ctx.batch = 24;
  } else if (argc > 1) {
    if (checkCmdLineFlag(argc, (const char **)argv, "width")) {
      g_ctx.width = getCmdLineArgumentInt(argc, (const char **)argv, "width");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "height")) {
      g_ctx.height = getCmdLineArgumentInt(argc, (const char **)argv, "height");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "pitch")) {
      g_ctx.pitch = getCmdLineArgumentInt(argc, (const char **)argv, "pitch");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
      getCmdLineArgumentString(argc, (const char **)argv, "input",
                               (char **)&g_ctx.input_nv12_file);
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "dst_width")) {
      g_ctx.dst_width =
          getCmdLineArgumentInt(argc, (const char **)argv, "dst_width");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "dst_height")) {
      g_ctx.dst_height =
          getCmdLineArgumentInt(argc, (const char **)argv, "dst_height");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "dst_pitch")) {
      g_ctx.dst_pitch =
          getCmdLineArgumentInt(argc, (const char **)argv, "dst_pitch");
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "batch")) {
      g_ctx.batch = getCmdLineArgumentInt(argc, (const char **)argv, "batch");
    }
  }

  g_ctx.device = findCudaDevice(argc, (const char **)argv);

  // All mandatory parameters must be present and non-zero.
  if ((g_ctx.width == 0) || (g_ctx.height == 0) || (g_ctx.dst_width == 0) ||
      (g_ctx.dst_height == 0) || !g_ctx.input_nv12_file) {
    printHelp(argv[0]);
    return -1;
  }

  // Unspecified pitches default to the tightly-packed width.
  if (g_ctx.pitch == 0) g_ctx.pitch = g_ctx.width;
  if (g_ctx.dst_pitch == 0) g_ctx.dst_pitch = g_ctx.dst_width;

  return 0;
}
/*
load nv12 yuvfile data into GPU device memory with batch of copy
*/
/*
 * Load one NV12 frame from g_ctx.input_nv12_file and replicate it
 * g_ctx.batch times into device memory for batch processing.
 * Returns 0 on success, -1 on failure.
 */
static int loadNV12Frame(unsigned char *d_inputNV12) {
  unsigned char *pNV12FrameData;
  unsigned char *d_nv12;
  int frameSize;
  std::ifstream nv12File(g_ctx.input_nv12_file,
                         std::ifstream::in | std::ios::binary);
  if (!nv12File.is_open()) {
    std::cerr << "Can't open files\n";
    return -1;
  }
  // One frame in the file: g_ctx.pitch bytes per row, ctx_heights rows
  // (luma plus interleaved chroma).
  frameSize = g_ctx.pitch * g_ctx.ctx_heights;
#if USE_UVM_MEM
  // Managed memory: read the file straight into the destination buffer.
  pNV12FrameData = d_inputNV12;
#else
  pNV12FrameData = (unsigned char *)malloc(frameSize);
  if (pNV12FrameData == NULL) {
    std::cerr << "Failed to malloc pNV12FrameData\n";
    return -1;
  }
#endif
  nv12File.read((char *)pNV12FrameData, frameSize);
  if (nv12File.gcount() < frameSize) {
    std::cerr << "can't get one frame!\n";
#if (USE_UVM_MEM == 0)
    free(pNV12FrameData);  // fix: this buffer leaked on the short-read path
#endif
    return -1;
  }
#if USE_UVM_MEM
  // Prefetch to GPU for following GPU operation
  cudaStreamAttachMemAsync(NULL, pNV12FrameData, 0, cudaMemAttachGlobal);
#endif
  // expand one frame to multi frames for batch processing
  d_nv12 = d_inputNV12;
  for (int i = 0; i < g_ctx.batch; i++) {
    // Fix: the source rows are g_ctx.pitch bytes apart (the file's pitch),
    // so the source pitch must be g_ctx.pitch, not g_ctx.width; only
    // g_ctx.width bytes of each row are copied.
    checkCudaErrors(cudaMemcpy2D((void *)d_nv12, g_ctx.ctx_pitch,
                                 pNV12FrameData, g_ctx.pitch, g_ctx.width,
                                 g_ctx.ctx_heights, cudaMemcpyHostToDevice));
    d_nv12 += g_ctx.ctx_pitch * g_ctx.ctx_heights;
  }
#if (USE_UVM_MEM == 0)
  free(pNV12FrameData);
#endif
  nv12File.close();
  return 0;
}
/*
1. resize interlace nv12 to target size
2. convert nv12 to bgr 3 progressive planars
*/
// Way 1 of the demo: resize the NV12 input to the destination resolution
// first, then convert the resized NV12 to BGR planar floats. Both stages are
// timed over TEST_LOOP iterations with CUDA events, and the final BGR output
// is dumped to disk via dumpBGR().
void nv12ResizeAndNV12ToBGR(unsigned char *d_inputNV12) {
unsigned char *d_resizedNV12;
float *d_outputBGR;
int size;
char filename[40];
/* allocate device memory for resized nv12 output */
size = g_ctx.dst_width * ceil(g_ctx.dst_height * 3.0f / 2.0f) * g_ctx.batch *
sizeof(unsigned char);
checkCudaErrors(cudaMalloc((void **)&d_resizedNV12, size));
/* allocate device memory for bgr output */
size = g_ctx.dst_pitch * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float);
checkCudaErrors(cudaMalloc((void **)&d_outputBGR, size));
// NOTE(review): this stream is created but never passed to the batch
// routines below — both run on the default stream. Either pass it or drop
// its creation.
cudaStream_t stream;
checkCudaErrors(cudaStreamCreate(&stream));
/* create cuda event handles */
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
float elapsedTime = 0.0f;
/* resize interlace nv12 */
cudaEventRecord(start, 0);
for (int i = 0; i < TEST_LOOP; i++) {
// Destination pitch is dst_width (tightly packed), not dst_pitch.
resizeNV12Batch(d_inputNV12, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height,
d_resizedNV12, g_ctx.dst_width, g_ctx.dst_width,
g_ctx.dst_height, g_ctx.batch);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf(
" CUDA resize nv12(%dx%d --> %dx%d), batch: %d,"
" average time: %.3f ms ==> %.3f ms/frame\n",
g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch,
(elapsedTime / (TEST_LOOP * 1.0f)),
(elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
// NOTE(review): this filename is overwritten below before being used; it
// looks like a dump of the resized NV12 was intended here.
sprintf(filename, "resized_nv12_%dx%d", g_ctx.dst_width, g_ctx.dst_height);
/* convert nv12 to bgr 3 progressive planars */
cudaEventRecord(start, 0);
for (int i = 0; i < TEST_LOOP; i++) {
// NOTE(review): d_resizedNV12 was written above with pitch dst_width but
// is read here with pitch dst_pitch — these differ when the user passes
// -dst_pitch != dst_width; confirm intended behavior.
nv12ToBGRplanarBatch(d_resizedNV12, g_ctx.dst_pitch, // intput
d_outputBGR,
g_ctx.dst_pitch * sizeof(float), // output
g_ctx.dst_width, g_ctx.dst_height, // output
g_ctx.batch, 0);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf(
" CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d,"
" average time: %.3f ms ==> %.3f ms/frame\n",
g_ctx.dst_width, g_ctx.dst_height, g_ctx.dst_width, g_ctx.dst_height,
g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)),
(elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
sprintf(filename, "converted_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height);
dumpBGR(d_outputBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height,
g_ctx.batch, (char *)"t1", filename);
/* release resources */
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaStreamDestroy(stream));
checkCudaErrors(cudaFree(d_resizedNV12));
checkCudaErrors(cudaFree(d_outputBGR));
}
/*
1. convert nv12 to bgr 3 progressive planars
2. resize bgr 3 planars to target size
*/
/*
 * TEST#2 pipeline:
 *   1. convert the batched interlaced NV12 input to 3 progressive BGR planes
 *   2. resize the BGR planes to the target (dst_width x dst_height) size
 *
 * d_inputNV12: device pointer to g_ctx.batch NV12 frames laid out with row
 * pitch g_ctx.ctx_pitch. Timing for each stage is measured with CUDA events
 * over TEST_LOOP iterations and printed as an average per frame.
 */
void nv12ToBGRandBGRresize(unsigned char *d_inputNV12) {
  float *d_bgr;        // full-size planar BGR output of the conversion stage
  float *d_resizedBGR; // resized planar BGR output
  int size;
  char filename[40];

  /* allocate device memory for bgr output */
  size = g_ctx.ctx_pitch * g_ctx.height * 3 * g_ctx.batch * sizeof(float);
  checkCudaErrors(cudaMalloc((void **)&d_bgr, size));

  /* allocate device memory for resized bgr output */
  size = g_ctx.dst_width * g_ctx.dst_height * 3 * g_ctx.batch * sizeof(float);
  checkCudaErrors(cudaMalloc((void **)&d_resizedBGR, size));

  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreate(&stream));

  /* create cuda event handles for GPU-side timing */
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  float elapsedTime = 0.0f;

  /* convert interlace nv12 to bgr 3 progressive planars */
  /* Drain previously queued work BEFORE recording the start event so it is
     not charged to this measurement (the original code synchronized after
     recording start, inflating the first reported time). */
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaEventRecord(start, 0));
  for (int i = 0; i < TEST_LOOP; i++) {
    nv12ToBGRplanarBatch(d_inputNV12, g_ctx.ctx_pitch, d_bgr,
                         g_ctx.ctx_pitch * sizeof(float), g_ctx.width,
                         g_ctx.height, g_ctx.batch, 0);
  }
  checkCudaErrors(cudaEventRecord(stop, 0));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));
  printf(
      " CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d,"
      " average time: %.3f ms ==> %.3f ms/frame\n",
      g_ctx.width, g_ctx.height, g_ctx.width, g_ctx.height, g_ctx.batch,
      (elapsedTime / (TEST_LOOP * 1.0f)),
      (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
  snprintf(filename, sizeof(filename), "converted_bgr_%dx%d", g_ctx.width,
           g_ctx.height);

  /* resize bgr 3 progressive planars */
  /* NOTE(review): dst_width is passed both as the output pitch and the
     output width here — confirm against resizeBGRplanarBatch's signature. */
  checkCudaErrors(cudaEventRecord(start, 0));
  for (int i = 0; i < TEST_LOOP; i++) {
    resizeBGRplanarBatch(d_bgr, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height,
                         d_resizedBGR, g_ctx.dst_width, g_ctx.dst_width,
                         g_ctx.dst_height, g_ctx.batch);
  }
  checkCudaErrors(cudaEventRecord(stop, 0));
  checkCudaErrors(cudaEventSynchronize(stop));
  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));
  printf(
      " CUDA resize bgr(%dx%d --> %dx%d), batch: %d,"
      " average time: %.3f ms ==> %.3f ms/frame\n",
      g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch,
      (elapsedTime / (TEST_LOOP * 1.0f)),
      (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
  snprintf(filename, sizeof(filename), "resized_bgr_%dx%d", g_ctx.dst_width,
           g_ctx.dst_height);
  dumpBGR(d_resizedBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height,
          g_ctx.batch, (char *)"t2", filename);

  /* release resources */
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaStreamDestroy(stream));
  checkCudaErrors(cudaFree(d_bgr));
  checkCudaErrors(cudaFree(d_resizedBGR));
}
/*
 * Entry point: parse the command line, allocate and load the batched NV12
 * input on the device, then run the two pipelines under test:
 *   TEST#1  resize NV12 first, then convert to BGR
 *   TEST#2  convert NV12 to BGR first, then resize the BGR planes
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE on bad arguments or a
 * failed input load.
 */
int main(int argc, char *argv[]) {
  unsigned char *d_inputNV12;

  if (parseCmdLine(argc, argv) < 0) return EXIT_FAILURE;

  /* round the row pitch up to the context alignment */
  g_ctx.ctx_pitch = g_ctx.width;
  int ctx_alignment = 32;
  g_ctx.ctx_pitch += (g_ctx.ctx_pitch % ctx_alignment != 0)
                         ? (ctx_alignment - g_ctx.ctx_pitch % ctx_alignment)
                         : 0;
  /* NV12 = full-height luma plane + half-height interleaved chroma: 3/2 rows */
  g_ctx.ctx_heights = ceil(g_ctx.height * 3.0f / 2.0f);

  /* load nv12 yuv data into d_inputNV12 with batch of copies */
#if USE_UVM_MEM
  checkCudaErrors(cudaMallocManaged(
      (void **)&d_inputNV12,
      (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch), cudaMemAttachHost));
  printf("\nUSE_UVM_MEM\n");
#else
  checkCudaErrors(
      cudaMalloc((void **)&d_inputNV12,
                 (g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch)));
#endif
  if (loadNV12Frame(d_inputNV12)) {
    std::cerr << "failed to load batch data!\n";
    /* was leaked on this error path in the original */
    checkCudaErrors(cudaFree(d_inputNV12));
    return EXIT_FAILURE;
  }

  /* firstly resize nv12, then convert nv12 to bgr */
  printf("\nTEST#1:\n");
  nv12ResizeAndNV12ToBGR(d_inputNV12);

  /* first convert nv12 to bgr, then resize bgr */
  printf("\nTEST#2:\n");
  nv12ToBGRandBGRresize(d_inputNV12);

  checkCudaErrors(cudaFree(d_inputNV12));
  return EXIT_SUCCESS;
}

View File

@ -0,0 +1,152 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <fstream>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#include "resize_convert.h"
#include "utils.h"
// Converts batched planar float BGR frames to interleaved 8-bit BGR.
// Each thread owns one pixel position and processes that position in every
// frame of the batch; src holds B, G, R planes of height*width floats per
// frame, dst receives height*width BGR byte triplets per frame.
__global__ void floatToChar(float *src, unsigned char *dst, int height,
                            int width, int batchSize) {
  int pixel = blockIdx.x * blockDim.x + threadIdx.x;
  int planeSize = height * width;
  if (pixel >= planeSize) return;

  int frameStride = planeSize * 3;  // floats (src) / bytes (dst) per frame
  for (int frame = 0; frame < batchSize; frame++) {
    float *srcFrame = src + frame * frameStride;
    unsigned char *dstPixel = dst + frame * frameStride + pixel * 3;
    dstPixel[0] = (unsigned char)srcFrame[planeSize * 0 + pixel];  // b
    dstPixel[1] = (unsigned char)srcFrame[planeSize * 1 + pixel];  // g
    dstPixel[2] = (unsigned char)srcFrame[planeSize * 2 + pixel];  // r
  }
}
// Host-side wrapper: launches floatToChar on the default stream with 1024
// threads per block and enough blocks to cover all height*width pixels.
void floatPlanarToChar(float *src, unsigned char *dst, int height, int width,
                       int batchSize) {
  const int threadsPerBlock = 1024;
  int pixels = height * width;
  int blocks = (pixels - 1) / threadsPerBlock + 1;
  floatToChar<<<blocks, threadsPerBlock, 0, NULL>>>(src, dst, height, width,
                                                    batchSize);
}
/*
 * Copies each planar BGR float frame in the batch from device memory to the
 * host and writes it as a raw file under output/<folder>/<tag>_<n>.raw
 * (n = 1..batchSize). The output directory is created best-effort via the
 * shell; a pre-existing directory is fine.
 *
 * d_srcBGR: device pointer to batchSize frames, each pitch*height*3 floats.
 * NOTE(review): only width*height*3 floats are copied from the start of each
 * pitched frame — assumes pitch == width within a frame; confirm callers.
 */
void dumpRawBGR(float *d_srcBGR, int pitch, int width, int height,
                int batchSize, char *folder, char *tag) {
  float *bgr, *d_bgr;
  int frameSize;
  char directory[120];
  char mkdir_cmd[256];

#if !defined(_WIN32)
  snprintf(directory, sizeof(directory), "output/%s", folder);
  snprintf(mkdir_cmd, sizeof(mkdir_cmd), "mkdir -p %s 2> /dev/null", directory);
#else
  snprintf(directory, sizeof(directory), "output\\%s", folder);
  snprintf(mkdir_cmd, sizeof(mkdir_cmd), "mkdir %s 2> nul", directory);
#endif
  int ret = system(mkdir_cmd);
  (void)ret;  // best effort; failure surfaces when the file open fails below

  frameSize = width * height * 3 * sizeof(float);
  bgr = (float *)malloc(frameSize);
  if (bgr == NULL) {
    std::cerr << "Failed malloc for bgr\n";
    return;
  }

  d_bgr = d_srcBGR;
  for (int i = 0; i < batchSize; i++) {
    char filename[120];
    checkCudaErrors(cudaMemcpy((void *)bgr, (void *)d_bgr, frameSize,
                               cudaMemcpyDeviceToHost));
    snprintf(filename, sizeof(filename), "%s/%s_%d.raw", directory, tag,
             (i + 1));
    /* Stack-allocated stream with a real open check: the original tested a
       `new std::ofstream` pointer against NULL, which `new` never returns,
       so open failures were silently ignored. */
    std::ofstream outputFile(filename);
    if (outputFile.is_open()) {
      outputFile.write((char *)bgr, frameSize);
    } else {
      std::cerr << "Failed to open " << filename << "\n";
    }
    /* advance one full pitched frame (3 planes of pitch*height floats) */
    d_bgr += pitch * height * 3;
  }
  free(bgr);
}
/* Thin convenience wrapper around dumpRawBGR: dumps a batch of planar BGR
   float frames from device memory to raw files under output/<folder>/. */
void dumpBGR(float *d_srcBGR, int pitch, int width, int height, int batchSize,
             char *folder, char *tag) {
  dumpRawBGR(d_srcBGR, pitch, width, height, batchSize, folder, tag);
}
/*
 * Copies `size` bytes of NV12 data from device memory to the host and writes
 * them to output/<folder>/<tag>.nv12. The output directory is created
 * best-effort via the shell.
 */
void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag) {
  unsigned char *nv12Data;
  char filename[120];
  char directory[60];
  char mkdir_cmd[256];

#if !defined(_WIN32)
  snprintf(directory, sizeof(directory), "output/%s", folder);
  snprintf(mkdir_cmd, sizeof(mkdir_cmd), "mkdir -p %s 2> /dev/null", directory);
#else
  snprintf(directory, sizeof(directory), "output\\%s", folder);
  snprintf(mkdir_cmd, sizeof(mkdir_cmd), "mkdir %s 2> nul", directory);
#endif
  int ret = system(mkdir_cmd);
  (void)ret;  // best effort; failure surfaces when the file open fails below

  snprintf(filename, sizeof(filename), "%s/%s.nv12", directory, tag);
  /* Stack-allocated stream with a real open check: the original tested a
     `new std::ofstream` pointer against NULL (never true) and leaked the
     stream object on the malloc-failure path. */
  std::ofstream nv12File(filename);
  if (!nv12File.is_open()) {
    std::cerr << "Failed to open " << filename;
    return;
  }

  nv12Data = (unsigned char *)malloc(size * (sizeof(char)));
  if (nv12Data == NULL) {
    std::cerr << "Failed to allocate memory\n";
    return;
  }

  checkCudaErrors(cudaMemcpy((void *)nv12Data, (void *)d_nv12, size,
                             cudaMemcpyDeviceToHost));
  nv12File.write((const char *)nv12Data, size);
  free(nv12Data);
}

View File

@ -0,0 +1,37 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __H_UTIL_
#define __H_UTIL_
// Dumps a batch of planar BGR float frames from device memory to raw files
// under output/<folder>/<tag>_<n>.raw; each frame is pitch*height*3 floats.
extern "C"
void dumpBGR(float *d_srcBGR, int pitch, int width, int height,
             int batchSize, char *folder, char *tag);
// Dumps `size` bytes of NV12 data from device memory to
// output/<folder>/<tag>.nv12.
extern "C"
void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag);
#endif

View File

@ -1,5 +1,5 @@
################################################################################ ################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions

View File

@ -19,12 +19,16 @@
<ProjectName>UnifiedMemoryPerf</ProjectName> <ProjectName>UnifiedMemoryPerf</ProjectName>
<CudaToolkitCustomDir /> <CudaToolkitCustomDir />
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup> <PropertyGroup>
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset> <PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'"> <PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,111 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryPerf_vs2019</RootNamespace>
<ProjectName>UnifiedMemoryPerf</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryPerf.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="commonKernels.cu" />
<ClCompile Include="helperFunctions.cpp" />
<CudaCompile Include="matrixMultiplyPerf.cu" />
<ClInclude Include="commonDefs.hpp" />
<ClInclude Include="commonKernels.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -1,5 +1,5 @@
################################################################################ ################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions

View File

@ -19,12 +19,16 @@
<ProjectName>bandwidthTest</ProjectName> <ProjectName>bandwidthTest</ProjectName>
<CudaToolkitCustomDir /> <CudaToolkitCustomDir />
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup> <PropertyGroup>
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset> <PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'"> <PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,108 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>bandwidthTest_vs2019</RootNamespace>
<ProjectName>bandwidthTest</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="bandwidthTest.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -1,5 +1,5 @@
################################################################################ ################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -19,12 +19,16 @@
<ProjectName>conjugateGradientCudaGraphs</ProjectName> <ProjectName>conjugateGradientCudaGraphs</ProjectName>
<CudaToolkitCustomDir /> <CudaToolkitCustomDir />
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup> <PropertyGroup>
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset> <PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'"> <PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,108 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>conjugateGradientCudaGraphs_vs2019</RootNamespace>
<ProjectName>conjugateGradientCudaGraphs</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/conjugateGradientCudaGraphs.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="conjugateGradientCudaGraphs.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -1,5 +1,5 @@
################################################################################ ################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -19,12 +19,16 @@
<ProjectName>conjugateGradientMultiBlockCG</ProjectName> <ProjectName>conjugateGradientMultiBlockCG</ProjectName>
<CudaToolkitCustomDir /> <CudaToolkitCustomDir />
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup> <PropertyGroup>
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset> <PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'"> <PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiBlockCG", "conjugateGradientMultiBlockCG_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,109 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>conjugateGradientMultiBlockCG_vs2019</RootNamespace>
<ProjectName>conjugateGradientMultiBlockCG</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
<GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="conjugateGradientMultiBlockCG.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -1,5 +1,5 @@
################################################################################ ################################################################################
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions
@ -286,7 +286,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif endif
endif endif
ALL_CCFLAGS += -dc ALL_CCFLAGS += -dc -maxrregcount=64
LIBRARIES += -lcudadevrt LIBRARIES += -lcudadevrt

View File

@ -4,6 +4,7 @@
<name>conjugateGradientMultiDeviceCG</name> <name>conjugateGradientMultiDeviceCG</name>
<cflags> <cflags>
<flag>-dc</flag> <flag>-dc</flag>
<flag>-maxrregcount=64</flag>
</cflags> </cflags>
<cuda_api_list> <cuda_api_list>
<toolkit>cudaMemAdvise</toolkit> <toolkit>cudaMemAdvise</toolkit>

View File

@ -1,4 +1,4 @@
/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions

View File

@ -19,12 +19,16 @@
<ProjectName>conjugateGradientMultiDeviceCG</ProjectName> <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
<CudaToolkitCustomDir /> <CudaToolkitCustomDir />
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup> <PropertyGroup>
<ConfigurationType>Application</ConfigurationType> <ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet> <CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset> <PlatformToolset>v141</PlatformToolset>
<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'"> <PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries> <UseDebugLibraries>true</UseDebugLibraries>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,109 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>conjugateGradientMultiDeviceCG_vs2019</RootNamespace>
<ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/conjugateGradientMultiDeviceCG.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
<GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="conjugateGradientMultiDeviceCG.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,299 @@
################################################################################
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
endif
ifeq ($(TARGET_OS),qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM
LDFLAGS += -lsocket
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - cuSolverDn_LinearSolver is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(TARGET_OS),linux)
ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\"
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../Common
LIBRARIES :=
################################################################################
LIBRARIES += -lcusolver -lcublas -lcusparse
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: cuSolverDn_LinearSolver
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
cuSolverDn_LinearSolver.o:cuSolverDn_LinearSolver.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
mmio.c.o:mmio.c
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
mmio_wrapper.o:mmio_wrapper.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cuSolverDn_LinearSolver: cuSolverDn_LinearSolver.o mmio.c.o mmio_wrapper.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./cuSolverDn_LinearSolver
clean:
rm -f cuSolverDn_LinearSolver cuSolverDn_LinearSolver.o mmio.c.o mmio_wrapper.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuSolverDn_LinearSolver
clobber: clean

View File

@ -0,0 +1,95 @@
# cuSolverDn_LinearSolver - cuSolverDn Linear Solver
## Description
A CUDA Sample that demonstrates cuSolverDN's LU, QR and Cholesky factorization.
## Key Concepts
Linear Algebra, CUSOLVER Library
## Supported SM Architectures
[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows, MacOSX
## Supported CPU Architecture
x86_64, ppc64le, aarch64
## CUDA APIs involved
## Dependencies needed to build/run
[CUSOLVER](../../README.md#cusolver), [CUBLAS](../../README.md#cublas), [CUSPARSE](../../README.md#cusparse)
## Prerequisites
Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
### Mac
The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
```
$ make SMS="A B ..."
```
* **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=clang
```
## References (for more details)

View File

@ -0,0 +1,584 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Test three linear solvers, including Cholesky, LU and QR.
* The user has to prepare a sparse matrix of "matrix market format" (with
* extension .mtx). For example, the user can download matrices in Florida
* Sparse Matrix Collection.
* (http://www.cise.ufl.edu/research/sparse/matrices/)
*
* The user needs to choose a solver by switch -R<solver> and
* to provide the path of the matrix by switch -F<file>, then
* the program solves
* A*x = b where b = ones(m,1)
* and reports relative error
* |b-A*x|/(|A|*|x|)
*
* The elapsed time is also reported so the user can compare efficiency of
* different solvers.
*
 * How to use
 *     ./cuSolverDn_LinearSolver                 // Default: cholesky
 *     ./cuSolverDn_LinearSolver -R=chol -file=<file>  // cholesky factorization
 *     ./cuSolverDn_LinearSolver -R=lu -file=<file>    // LU with partial
 * pivoting
 *     ./cuSolverDn_LinearSolver -R=qr -file=<file>    // QR factorization
*
* Remark: the absolute error on solution x is meaningless without knowing
* condition number of A. The relative error on residual should be close to
* machine zero, i.e. 1.e-15.
*/
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
// Forward declaration; implemented in mmio_wrapper.cpp (compiled and linked
// separately, see the Makefile).  Loads a Matrix Market (.mtx) file into host
// arrays allocated by the callee (CSR layout when csrFormat is true); callers
// in this file treat a nonzero return as failure.
// NOTE(review): extendSymMatrix presumably expands symmetric storage to the
// full matrix -- confirm against mmio_wrapper.cpp.
template <typename T_ELEM>
int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m,
                       int *n, int *nnz, T_ELEM **aVal, int **aRowInd,
                       int **aColInd, int extendSymMatrix);
// Print the command-line help text for this sample and terminate the process.
// Never returns (calls exit(0)).
void UsageDN(void) {
  printf(
      "<options>\n"
      "-h          : display this help\n"
      "-R=<name>   : choose a linear solver\n"
      "              chol (cholesky factorization), this is default\n"
      "              qr   (QR factorization)\n"
      "              lu   (LU factorization)\n"
      "-lda=<int>  : leading dimension of A , m by default\n"
      "-file=<filename>: filename containing a matrix in MM format\n"
      "-device=<device_id> : <device_id> if want to run on specific GPU\n");
  exit(0);
}
/*
 * Solve A*x = b by dense Cholesky factorization (potrf + potrs).
 *
 * handle : cuSOLVER dense handle
 * n      : dimension of the n-by-n symmetric positive-definite matrix
 * Acopy  : device pointer to column-major A; left untouched (a private
 *          device copy is factorized instead)
 * lda    : leading dimension of A (>= n)
 * b      : device pointer to the right-hand side (read-only)
 * x      : device pointer receiving the solution
 *
 * Prints the elapsed factor+solve time; returns 0.
 */
int linearSolverCHOL(cusolverDnHandle_t handle, int n, const double *Acopy,
                     int lda, const double *b, double *x) {
  const cublasFillMode_t fillMode = CUBLAS_FILL_MODE_LOWER;
  int workSize = 0;
  int *devInfo = NULL;
  double *workspace = NULL;
  double *factor = NULL;  // private device copy of A, overwritten by L
  int infoHost = 0;
  double tStart, tStop;
  double elapsed;

  checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, fillMode, n,
                                              (double *)Acopy, lda, &workSize));

  checkCudaErrors(cudaMalloc(&devInfo, sizeof(int)));
  checkCudaErrors(cudaMalloc(&workspace, sizeof(double) * workSize));
  checkCudaErrors(cudaMalloc(&factor, sizeof(double) * lda * n));

  // potrf overwrites its input with the Cholesky factor, so work on a copy
  checkCudaErrors(cudaMemcpy(factor, Acopy, sizeof(double) * lda * n,
                             cudaMemcpyDeviceToDevice));
  checkCudaErrors(cudaMemset(devInfo, 0, sizeof(int)));

  tStart = second();
  tStart = second();

  checkCudaErrors(cusolverDnDpotrf(handle, fillMode, n, factor, lda, workspace,
                                   workSize, devInfo));

  // D2H copy synchronizes, so infoHost reflects the finished factorization
  checkCudaErrors(
      cudaMemcpy(&infoHost, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
  if (0 != infoHost) {
    fprintf(stderr, "Error: Cholesky factorization failed\n");
  }

  // potrs solves in place: seed x with b first
  checkCudaErrors(
      cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
  checkCudaErrors(
      cusolverDnDpotrs(handle, fillMode, n, 1, factor, lda, x, n, devInfo));
  checkCudaErrors(cudaDeviceSynchronize());
  tStop = second();

  elapsed = tStop - tStart;
  fprintf(stdout, "timing: cholesky = %10.6f sec\n", elapsed);

  if (devInfo) {
    checkCudaErrors(cudaFree(devInfo));
  }
  if (workspace) {
    checkCudaErrors(cudaFree(workspace));
  }
  if (factor) {
    checkCudaErrors(cudaFree(factor));
  }
  return 0;
}
/*
 * Solve A*x = b by LU factorization with partial pivoting (getrf + getrs).
 *
 * handle : cuSOLVER dense handle
 * n      : dimension of the n-by-n matrix
 * Acopy  : device pointer to column-major A; left untouched (a private
 *          device copy is factorized instead)
 * lda    : leading dimension of A (>= n)
 * b      : device pointer to the right-hand side (read-only)
 * x      : device pointer receiving the solution
 *
 * Prints the elapsed factor+solve time; returns 0.
 */
int linearSolverLU(cusolverDnHandle_t handle, int n, const double *Acopy,
                   int lda, const double *b, double *x) {
  int workSize = 0;
  int *devInfo = NULL;
  double *workspace = NULL;
  double *factor = NULL;  // private device copy of A, overwritten by LU
  int *pivots = NULL;     // pivoting sequence
  int infoHost = 0;
  double tStart, tStop;
  double elapsed;

  checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double *)Acopy,
                                              lda, &workSize));

  checkCudaErrors(cudaMalloc(&devInfo, sizeof(int)));
  checkCudaErrors(cudaMalloc(&workspace, sizeof(double) * workSize));
  checkCudaErrors(cudaMalloc(&factor, sizeof(double) * lda * n));
  checkCudaErrors(cudaMalloc(&pivots, sizeof(int) * n));

  // getrf overwrites its input with the factors, so work on a copy
  checkCudaErrors(cudaMemcpy(factor, Acopy, sizeof(double) * lda * n,
                             cudaMemcpyDeviceToDevice));
  checkCudaErrors(cudaMemset(devInfo, 0, sizeof(int)));

  tStart = second();
  tStart = second();

  checkCudaErrors(
      cusolverDnDgetrf(handle, n, n, factor, lda, workspace, pivots, devInfo));

  // D2H copy synchronizes, so infoHost reflects the finished factorization
  checkCudaErrors(
      cudaMemcpy(&infoHost, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
  if (0 != infoHost) {
    fprintf(stderr, "Error: LU factorization failed\n");
  }

  // getrs solves in place: seed x with b first
  checkCudaErrors(
      cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
  checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, factor, lda,
                                   pivots, x, n, devInfo));
  checkCudaErrors(cudaDeviceSynchronize());
  tStop = second();

  elapsed = tStop - tStart;
  fprintf(stdout, "timing: LU = %10.6f sec\n", elapsed);

  if (devInfo) {
    checkCudaErrors(cudaFree(devInfo));
  }
  if (workspace) {
    checkCudaErrors(cudaFree(workspace));
  }
  if (factor) {
    checkCudaErrors(cudaFree(factor));
  }
  if (pivots) {
    checkCudaErrors(cudaFree(pivots));
  }
  return 0;
}
/*
 * Solve A*x = b by QR factorization.
 *
 * Factorizes A = Q*R (geqrf), applies Q^T to b (ormqr), then solves the
 * triangular system R*x = Q^T*b with cublasDtrsm.
 *
 * handle : cuSOLVER dense handle
 * n      : dimension of the n-by-n matrix
 * Acopy  : device pointer to column-major A; left untouched (a private
 *          device copy is factorized instead)
 * lda    : leading dimension of A (>= n)
 * b      : device pointer to the right-hand side (read-only)
 * x      : device pointer receiving the solution
 *
 * Prints the elapsed factor+solve time; returns 0.
 */
int linearSolverQR(cusolverDnHandle_t handle, int n, const double *Acopy,
                   int lda, const double *b, double *x) {
  cublasHandle_t cublasHandle = NULL;  // used for the triangular solve
  int bufferSize = 0;
  int bufferSize_geqrf = 0;
  int bufferSize_ormqr = 0;
  int *info = NULL;
  double *buffer = NULL;
  double *A = NULL;
  double *tau = NULL;  // Householder scalars produced by geqrf
  int h_info = 0;
  double start, stop;
  double time_solve;
  const double one = 1.0;

  checkCudaErrors(cublasCreate(&cublasHandle));

  checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double *)Acopy,
                                              lda, &bufferSize_geqrf));
  // BUGFIX: query the workspace with Acopy, not A -- A is still NULL at this
  // point (it is only allocated below), matching the geqrf query above.
  checkCudaErrors(cusolverDnDormqr_bufferSize(
      handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1, n, (double *)Acopy, lda,
      NULL, x, n, &bufferSize_ormqr));

  printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf,
         bufferSize_ormqr);

  // allocate a single workspace large enough for both geqrf and ormqr
  bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf
                                                     : bufferSize_ormqr;
  checkCudaErrors(cudaMalloc(&info, sizeof(int)));
  checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
  checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
  checkCudaErrors(cudaMalloc((void **)&tau, sizeof(double) * n));

  // prepare a copy of A because geqrf will overwrite it with the factors
  checkCudaErrors(
      cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
  checkCudaErrors(cudaMemset(info, 0, sizeof(int)));

  start = second();
  start = second();

  // compute QR factorization
  checkCudaErrors(
      cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));

  // D2H copy synchronizes, so h_info reflects the finished factorization
  checkCudaErrors(
      cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
  if (0 != h_info) {
    // BUGFIX: this is the QR path; the message previously said "LU"
    fprintf(stderr, "Error: QR factorization failed\n");
  }

  checkCudaErrors(
      cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));

  // compute Q^T*b
  checkCudaErrors(cusolverDnDormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1,
                                   n, A, lda, tau, x, n, buffer, bufferSize,
                                   info));

  // x = R \ Q^T*b
  checkCudaErrors(cublasDtrsm(cublasHandle, CUBLAS_SIDE_LEFT,
                              CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                              CUBLAS_DIAG_NON_UNIT, n, 1, &one, A, lda, x, n));
  checkCudaErrors(cudaDeviceSynchronize());
  stop = second();

  time_solve = stop - start;
  fprintf(stdout, "timing: QR = %10.6f sec\n", time_solve);

  if (cublasHandle) {
    checkCudaErrors(cublasDestroy(cublasHandle));
  }
  if (info) {
    checkCudaErrors(cudaFree(info));
  }
  if (buffer) {
    checkCudaErrors(cudaFree(buffer));
  }
  if (A) {
    checkCudaErrors(cudaFree(A));
  }
  if (tau) {
    checkCudaErrors(cudaFree(tau));
  }
  return 0;
}
/*
 * Parse command-line switches into opts (zero-initialized first):
 *   -h              print usage and exit
 *   -R=<chol|lu|qr> solver selection (stored in opts.testFunc)
 *   -file=<path>    Matrix Market input file (stored in opts.sparse_mat_filename)
 *   -lda=<int>      leading dimension override (stored in opts.lda)
 * Invalid -R values or a missing -file value print usage and exit.
 */
void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) {
  memset(&opts, 0, sizeof(opts));

  // BUGFIX: checkCmdLineFlag strips leading '-' characters from each argv
  // entry before comparing, so the reference string must be "h"; with "-h"
  // the help flag could never match.
  if (checkCmdLineFlag(argc, (const char **)argv, "h")) {
    UsageDN();
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "R")) {
    char *solverType = NULL;
    getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType);

    if (solverType) {
      // only chol, lu and qr are implemented; anything else is an error
      if ((STRCASECMP(solverType, "chol") != 0) &&
          (STRCASECMP(solverType, "lu") != 0) &&
          (STRCASECMP(solverType, "qr") != 0)) {
        printf("\nIncorrect argument passed to -R option\n");
        UsageDN();
      } else {
        opts.testFunc = solverType;
      }
    }
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
    char *fileName = 0;
    getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);

    if (fileName) {
      opts.sparse_mat_filename = fileName;
    } else {
      printf("\nIncorrect filename passed to -file \n ");
      UsageDN();
    }
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "lda")) {
    opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda");
  }
}
// Driver: load a Matrix Market matrix, densify it, solve A*x = ones(m,1)
// with the solver chosen by -R (chol/lu/qr), and report the relative
// residual |b - A*x| / (|A| * |x|) plus timing.
int main(int argc, char *argv[]) {
  struct testOpts opts;
  cusolverDnHandle_t handle = NULL;
  cublasHandle_t cublasHandle = NULL;  // used in residual evaluation
  cudaStream_t stream = NULL;
  int rowsA = 0;  // number of rows of A
  int colsA = 0;  // number of columns of A
  int nnzA = 0;   // number of nonzeros of A
  int baseA = 0;  // base index in CSR format
  int lda = 0;    // leading dimension in dense matrix
  // CSR(A) from I/O
  int *h_csrRowPtrA = NULL;
  int *h_csrColIndA = NULL;
  double *h_csrValA = NULL;
  double *h_A = NULL;  // dense matrix from CSR(A)
  double *h_x = NULL;  // a copy of d_x
  double *h_b = NULL;  // b = ones(m,1)
  double *h_r = NULL;  // r = b - A*x, a copy of d_r
  double *d_A = NULL;  // a copy of h_A
  double *d_x = NULL;  // x = A \ b
  double *d_b = NULL;  // a copy of h_b
  double *d_r = NULL;  // r = b - A*x
  // the constants are used in residual evaluation, r = b - A*x
  const double minus_one = -1.0;
  const double one = 1.0;
  double x_inf = 0.0;
  double r_inf = 0.0;
  double A_inf = 0.0;
  int errors = 0;  // NOTE(review): never updated or read below
  parseCommandLineArguments(argc, argv, opts);
  if (NULL == opts.testFunc) {
    opts.testFunc = "chol";  // By default running Cholesky as NO solver
                             // selected with -R option.
  }
  findCudaDevice(argc, (const char **)argv);
  printf("step 1: read matrix market format\n");
  // Fall back to the bundled gr_900_900_crg.mtx when no -file was given.
  if (opts.sparse_mat_filename == NULL) {
    opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
    if (opts.sparse_mat_filename != NULL)
      printf("Using default input file [%s]\n", opts.sparse_mat_filename);
    else
      printf("Could not find gr_900_900_crg.mtx\n");
  } else {
    printf("Using input file [%s]\n", opts.sparse_mat_filename);
  }
  if (opts.sparse_mat_filename == NULL) {
    fprintf(stderr, "Error: input matrix is not provided\n");
    return EXIT_FAILURE;
  }
  // loadMMSparseMatrix returns nonzero on failure; arrays are malloc'd by it.
  if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true, &rowsA,
                                 &colsA, &nnzA, &h_csrValA, &h_csrRowPtrA,
                                 &h_csrColIndA, true)) {
    exit(EXIT_FAILURE);
  }
  baseA = h_csrRowPtrA[0];  // baseA = {0,1}
  printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA,
         nnzA, baseA);
  if (rowsA != colsA) {
    fprintf(stderr, "Error: only support square matrix\n");
    exit(EXIT_FAILURE);
  }
  printf("step 2: convert CSR(A) to dense matrix\n");
  // lda defaults to the matrix dimension unless overridden via -lda.
  lda = opts.lda ? opts.lda : rowsA;
  if (lda < rowsA) {
    fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
    exit(EXIT_FAILURE);
  }
  h_A = (double *)malloc(sizeof(double) * lda * colsA);
  h_x = (double *)malloc(sizeof(double) * colsA);
  h_b = (double *)malloc(sizeof(double) * rowsA);
  h_r = (double *)malloc(sizeof(double) * rowsA);
  assert(NULL != h_A);
  assert(NULL != h_x);
  assert(NULL != h_b);
  assert(NULL != h_r);
  // Scatter CSR entries into a zeroed column-major dense array
  // (element (row, col) lives at h_A[row + col * lda]).
  memset(h_A, 0, sizeof(double) * lda * colsA);
  for (int row = 0; row < rowsA; row++) {
    const int start = h_csrRowPtrA[row] - baseA;
    const int end = h_csrRowPtrA[row + 1] - baseA;
    for (int colidx = start; colidx < end; colidx++) {
      const int col = h_csrColIndA[colidx] - baseA;
      const double Areg = h_csrValA[colidx];
      h_A[row + col * lda] = Areg;
    }
  }
  printf("step 3: set right hand side vector (b) to 1\n");
  for (int row = 0; row < rowsA; row++) {
    h_b[row] = 1.0;
  }
  // verify if A is symmetric or not.
  // Only required for Cholesky; LU and QR accept general matrices.
  if (0 == strcmp(opts.testFunc, "chol")) {
    int issym = 1;
    for (int j = 0; j < colsA; j++) {
      for (int i = j; i < rowsA; i++) {
        double Aij = h_A[i + j * lda];
        double Aji = h_A[j + i * lda];
        if (Aij != Aji) {
          issym = 0;
          // NOTE(review): this break only exits the inner loop, so the outer
          // loop keeps scanning; the result is still correct because issym
          // stays 0 once cleared.
          break;
        }
      }
    }
    if (!issym) {
      printf("Error: A has no symmetric pattern, please use LU or QR \n");
      exit(EXIT_FAILURE);
    }
  }
  // Create library handles and bind everything to one stream.
  checkCudaErrors(cusolverDnCreate(&handle));
  checkCudaErrors(cublasCreate(&cublasHandle));
  checkCudaErrors(cudaStreamCreate(&stream));
  checkCudaErrors(cusolverDnSetStream(handle, stream));
  checkCudaErrors(cublasSetStream(cublasHandle, stream));
  checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double) * lda * colsA));
  checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA));
  checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA));
  checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA));
  printf("step 4: prepare data on device\n");
  checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double) * lda * colsA,
                             cudaMemcpyHostToDevice));
  checkCudaErrors(
      cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice));
  printf("step 5: solve A*x = b \n");
  // d_A and d_b are read-only
  if (0 == strcmp(opts.testFunc, "chol")) {
    linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
  } else if (0 == strcmp(opts.testFunc, "lu")) {
    linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
  } else if (0 == strcmp(opts.testFunc, "qr")) {
    linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
  } else {
    fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
    exit(EXIT_FAILURE);
  }
  printf("step 6: evaluate residual\n");
  checkCudaErrors(
      cudaMemcpy(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice));
  // r = b - A*x
  // (computed as gemm with alpha = -1, beta = 1: d_r = -A*x + d_r)
  checkCudaErrors(cublasDgemm_v2(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, rowsA,
                                 1, colsA, &minus_one, d_A, lda, d_x, rowsA,
                                 &one, d_r, rowsA));
  // Blocking D2H copies double as synchronization with the stream's work.
  checkCudaErrors(
      cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost));
  checkCudaErrors(
      cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost));
  // Infinity norms for the relative-residual report.
  x_inf = vec_norminf(colsA, h_x);
  r_inf = vec_norminf(rowsA, h_r);
  A_inf = mat_norminf(rowsA, colsA, h_A, lda);
  printf("|b - A*x| = %E \n", r_inf);
  printf("|A| = %E \n", A_inf);
  printf("|x| = %E \n", x_inf);
  printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf));
  // Tear down library handles, stream, and all host/device allocations.
  if (handle) {
    checkCudaErrors(cusolverDnDestroy(handle));
  }
  if (cublasHandle) {
    checkCudaErrors(cublasDestroy(cublasHandle));
  }
  if (stream) {
    checkCudaErrors(cudaStreamDestroy(stream));
  }
  if (h_csrValA) {
    free(h_csrValA);
  }
  if (h_csrRowPtrA) {
    free(h_csrRowPtrA);
  }
  if (h_csrColIndA) {
    free(h_csrColIndA);
  }
  if (h_A) {
    free(h_A);
  }
  if (h_x) {
    free(h_x);
  }
  if (h_b) {
    free(h_b);
  }
  if (h_r) {
    free(h_r);
  }
  if (d_A) {
    checkCudaErrors(cudaFree(d_A));
  }
  if (d_x) {
    checkCudaErrors(cudaFree(d_x));
  }
  if (d_b) {
    checkCudaErrors(cudaFree(d_b));
  }
  if (d_r) {
    checkCudaErrors(cudaFree(d_r));
  }
  return 0;
}

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2012
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,109 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cuSolverDn_LinearSolver_vs2012</RootNamespace>
<ProjectName>cuSolverDn_LinearSolver</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="cuSolverDn_LinearSolver.cpp" />
<ClCompile Include="mmio.c" />
<ClCompile Include="mmio_wrapper.cpp" />
<ClInclude Include="mmio.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 13.00
# Visual Studio 2013
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,109 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cuSolverDn_LinearSolver_vs2013</RootNamespace>
<ProjectName>cuSolverDn_LinearSolver</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="cuSolverDn_LinearSolver.cpp" />
<ClCompile Include="mmio.c" />
<ClCompile Include="mmio_wrapper.cpp" />
<ClInclude Include="mmio.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 14.00
# Visual Studio 2015
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,109 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cuSolverDn_LinearSolver_vs2015</RootNamespace>
<ProjectName>cuSolverDn_LinearSolver</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v140</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="cuSolverDn_LinearSolver.cpp" />
<ClCompile Include="mmio.c" />
<ClCompile Include="mmio_wrapper.cpp" />
<ClInclude Include="mmio.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,114 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cuSolverDn_LinearSolver_vs2017</RootNamespace>
<ProjectName>cuSolverDn_LinearSolver</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="cuSolverDn_LinearSolver.cpp" />
<ClCompile Include="mmio.c" />
<ClCompile Include="mmio_wrapper.cpp" />
<ClInclude Include="mmio.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,110 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cuSolverDn_LinearSolver_vs2019</RootNamespace>
<ProjectName>cuSolverDn_LinearSolver</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
<Include>./;../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="cuSolverDn_LinearSolver.cpp" />
<ClCompile Include="mmio.c" />
<ClCompile Include="mmio_wrapper.cpp" />
<ClInclude Include="mmio.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
</ImportGroup>
</Project>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,521 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
/* avoid Windows warnings (for example: strcpy, fscanf, etc.) */
#if defined(_WIN32)
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "mmio.h"
/* Read an unsymmetric real sparse matrix in Matrix Market coordinate
 * format from file `fname`.  On success returns 0 and stores the
 * dimensions in *M_/*N_, the nonzero count in *nz_, and newly malloc'd
 * 0-based coordinate arrays in *I_/*J_/*val_ (caller frees all three).
 * Returns -1 on any failure (open, banner, type, size, or data error). */
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
                double **val_, int **I_, int **J_)
{
    FILE *f;
    MM_typecode matcode;
    int M, N, nz;
    int i;
    double *val;
    int *I, *J;

    if ((f = fopen(fname, "r")) == NULL)
        return -1;

    if (mm_read_banner(f, &matcode) != 0)
    {
        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
        printf(" in file [%s]\n", fname);
        fclose(f);              /* was leaked on this path */
        return -1;
    }

    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
            mm_is_sparse(matcode)))
    {
        fprintf(stderr, "Sorry, this application does not support ");
        /* fixed "Market Market" typo in the original message */
        fprintf(stderr, "Matrix Market type: [%s]\n",
                mm_typecode_to_str(matcode));
        fclose(f);              /* was leaked on this path */
        return -1;
    }

    /* find out size of sparse matrix: M, N, nz .... */
    if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0)
    {
        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
        fclose(f);              /* was leaked on this path */
        return -1;
    }

    *M_ = M;
    *N_ = N;
    *nz_ = nz;

    /* reserve memory for matrices */
    I = (int *) malloc(nz * sizeof(int));
    J = (int *) malloc(nz * sizeof(int));
    val = (double *) malloc(nz * sizeof(double));
    if (I == NULL || J == NULL || val == NULL)
    {
        /* original code dereferenced NULL on allocation failure */
        free(I);
        free(J);
        free(val);
        fclose(f);
        return -1;
    }

    *val_ = val;
    *I_ = I;
    *J_ = J;

    /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
    /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
    /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)           */
    for (i = 0; i < nz; i++)
    {
        if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) != 3) {
            fclose(f);          /* was leaked on this path */
            return -1;
        }
        I[i]--;  /* adjust from 1-based to 0-based */
        J[i]--;
    }
    fclose(f);

    return 0;
}
/* Sanity-check a typecode: reject field combinations that the Matrix
 * Market format does not allow.  Returns 1 if consistent, 0 otherwise. */
int mm_is_valid(MM_typecode matcode)
{
    /* every typecode must describe a matrix object */
    if (!mm_is_matrix(matcode))
        return 0;

    /* dense arrays cannot use the pattern data type */
    if (mm_is_dense(matcode) && mm_is_pattern(matcode))
        return 0;

    /* hermitian storage only makes sense for complex data */
    if (mm_is_real(matcode) && mm_is_hermitian(matcode))
        return 0;

    /* pattern matrices carry no values, so hermitian/skew are meaningless */
    if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || mm_is_skew(matcode)))
        return 0;

    return 1;
}
/* Parse the first line of a Matrix Market file, e.g.
 *   %%MatrixMarket matrix coordinate real general
 * and encode its four fields into *matcode.
 * Returns 0 on success, MM_PREMATURE_EOF if the line is missing or has
 * fewer than five tokens, MM_NO_HEADER if it does not start with the
 * "%%MatrixMarket" banner, or MM_UNSUPPORTED_TYPE for an unknown field. */
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
    char line[MM_MAX_LINE_LENGTH];
    char banner[MM_MAX_TOKEN_LENGTH];
    char mtx[MM_MAX_TOKEN_LENGTH];
    char crd[MM_MAX_TOKEN_LENGTH];
    char data_type[MM_MAX_TOKEN_LENGTH];
    char storage_scheme[MM_MAX_TOKEN_LENGTH];
    char *p;

    mm_clear_typecode(matcode);

    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
        return MM_PREMATURE_EOF;

    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
        storage_scheme) != 5)
        return MM_PREMATURE_EOF;

    /* field matching below is case-insensitive: lower-case all tokens */
    for (p=mtx; *p!='\0'; *p=tolower(*p),p++);
    for (p=crd; *p!='\0'; *p=tolower(*p),p++);
    for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
    for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);

    /* check for banner */
    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
        return MM_NO_HEADER;

    /* first field should be "mtx" */
    if (strcmp(mtx, MM_MTX_STR) != 0)
        return MM_UNSUPPORTED_TYPE;
    mm_set_matrix(matcode);

    /* second field describes whether this is a sparse matrix (in coordinate
       storage) or a dense array */
    if (strcmp(crd, MM_SPARSE_STR) == 0)
        mm_set_sparse(matcode);
    else
    if (strcmp(crd, MM_DENSE_STR) == 0)
        mm_set_dense(matcode);
    else
        return MM_UNSUPPORTED_TYPE;

    /* third field: element data type */
    if (strcmp(data_type, MM_REAL_STR) == 0)
        mm_set_real(matcode);
    else
    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
        mm_set_complex(matcode);
    else
    if (strcmp(data_type, MM_PATTERN_STR) == 0)
        mm_set_pattern(matcode);
    else
    if (strcmp(data_type, MM_INT_STR) == 0)
        mm_set_integer(matcode);
    else
        return MM_UNSUPPORTED_TYPE;

    /* fourth field: symmetry / storage scheme */
    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
        mm_set_general(matcode);
    else
    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
        mm_set_symmetric(matcode);
    else
    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
        mm_set_hermitian(matcode);
    else
    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
        mm_set_skew(matcode);
    else
        return MM_UNSUPPORTED_TYPE;

    return 0;
}
/* Write the "M N nz" size line of a coordinate-format file.
 * Returns 0 on success, MM_COULD_NOT_WRITE_FILE on an I/O error. */
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
{
    /* fprintf returns the number of CHARACTERS written (>= 6 here on
       success), not the number of conversions, so the original test
       `!= 3` reported failure even when the write succeeded.  A negative
       return is the actual error indication. */
    if (fprintf(f, "%d %d %d\n", M, N, nz) < 0)
        return MM_COULD_NOT_WRITE_FILE;
    else
        return 0;
}
/* Read the size line "M N nz" of a coordinate-format file, skipping any
 * leading '%' comment lines.  Returns 0 on success or MM_PREMATURE_EOF. */
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
    char buf[MM_MAX_LINE_LENGTH];
    int nread;

    /* defaults in case of an early error return */
    *M = 0;
    *N = 0;
    *nz = 0;

    /* skip the comment block */
    do
    {
        if (fgets(buf, MM_MAX_LINE_LENGTH, f) == NULL)
            return MM_PREMATURE_EOF;
    } while (buf[0] == '%');

    /* the current line is either blank or already holds M, N, nz */
    if (sscanf(buf, "%d %d %d", M, N, nz) == 3)
        return 0;

    /* blank line: keep scanning until all three integers are found */
    do
    {
        nread = fscanf(f, "%d %d %d", M, N, nz);
        if (nread == EOF)
            return MM_PREMATURE_EOF;
    } while (nread != 3);

    return 0;
}
/* Read the "M N" size line of an array-format (dense) file, skipping any
 * leading '%' comment lines.  Returns 0 on success or MM_PREMATURE_EOF. */
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
{
    char buf[MM_MAX_LINE_LENGTH];
    int nread;

    /* defaults in case of an early error return */
    *M = 0;
    *N = 0;

    /* skip the comment block */
    do
    {
        if (fgets(buf, MM_MAX_LINE_LENGTH, f) == NULL)
            return MM_PREMATURE_EOF;
    } while (buf[0] == '%');

    /* the current line is either blank or already holds M and N */
    if (sscanf(buf, "%d %d", M, N) == 2)
        return 0;

    /* blank line: keep scanning until both integers are found */
    do
    {
        nread = fscanf(f, "%d %d", M, N);
        if (nread == EOF)
            return MM_PREMATURE_EOF;
    } while (nread != 2);

    return 0;
}
/* Write the "M N" size line of an array-format file.
 * Returns 0 on success, MM_COULD_NOT_WRITE_FILE on an I/O error. */
int mm_write_mtx_array_size(FILE *f, int M, int N)
{
    /* fprintf returns a character count (>= 4 here on success), not the
       conversion count, so the original `!= 2` test reported failure on
       every successful write; < 0 is the real error indication. */
    if (fprintf(f, "%d %d\n", M, N) < 0)
        return MM_COULD_NOT_WRITE_FILE;
    else
        return 0;
}
/*-------------------------------------------------------------------------*/
/******************************************************************/
/* use when I[], J[], and val[]J, and val[] are already allocated */
/******************************************************************/
/* Fill the preallocated arrays I[], J[] (and val[], except for pattern
 * matrices) with nz coordinate entries read from f.  Complex data stores
 * two doubles per entry, so val[] must hold 2*nz elements.
 * Returns 0, MM_PREMATURE_EOF, or MM_UNSUPPORTED_TYPE. */
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
        double val[], MM_typecode matcode)
{
    int k;

    if (mm_is_complex(matcode))
    {
        /* two values (real, imaginary) per entry */
        for (k = 0; k < nz; k++)
        {
            if (fscanf(f, "%d %d %lg %lg", &I[k], &J[k], &val[2*k], &val[2*k+1]) != 4)
                return MM_PREMATURE_EOF;
        }
    }
    else if (mm_is_real(matcode) || mm_is_integer(matcode))
    {
        /* one value per entry ("%lg" is required by ANSI C for doubles) */
        for (k = 0; k < nz; k++)
        {
            if (fscanf(f, "%d %d %lg\n", &I[k], &J[k], &val[k]) != 3)
                return MM_PREMATURE_EOF;
        }
    }
    else if (mm_is_pattern(matcode))
    {
        /* structure only, no values */
        for (k = 0; k < nz; k++)
        {
            if (fscanf(f, "%d %d", &I[k], &J[k]) != 2)
                return MM_PREMATURE_EOF;
        }
    }
    else
        return MM_UNSUPPORTED_TYPE;

    return 0;
}
/* Read a single coordinate entry from f into *I, *J and, depending on
 * the data type, *real / *imag (pattern matrices fill neither).
 * Returns 0, MM_PREMATURE_EOF, or MM_UNSUPPORTED_TYPE. */
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
        double *real, double *imag, MM_typecode matcode)
{
    if (mm_is_complex(matcode))
    {
        /* row col real imag */
        if (fscanf(f, "%d %d %lg %lg", I, J, real, imag) != 4)
            return MM_PREMATURE_EOF;
        return 0;
    }

    if (mm_is_real(matcode) || mm_is_integer(matcode))
    {
        /* row col value */
        if (fscanf(f, "%d %d %lg\n", I, J, real) != 3)
            return MM_PREMATURE_EOF;
        return 0;
    }

    if (mm_is_pattern(matcode))
    {
        /* row col only */
        if (fscanf(f, "%d %d", I, J) != 2)
            return MM_PREMATURE_EOF;
        return 0;
    }

    return MM_UNSUPPORTED_TYPE;
}
/************************************************************************
mm_read_mtx_crd() fills M, N, nz, array of values, and return
type code, e.g. 'MCRS'
if matrix is complex, values[] is of size 2*nz,
(nz pairs of real/imaginary values)
************************************************************************/
/* Read a complete sparse coordinate file: fills *M, *N, *nz, allocates
 * and fills *I, *J (1-based indices as stored in the file) and *val
 * (NULL for pattern matrices, 2*nz doubles for complex), and returns the
 * typecode in *matcode.  `fname` == "stdin" reads from standard input.
 * Returns 0 on success or an MM_* error code; on failure the output
 * arrays are freed and reset to NULL, and the file is closed (the
 * original leaked both the FILE* and the arrays on error paths and never
 * checked malloc). */
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
        double **val, MM_typecode *matcode)
{
    int ret_code;
    FILE *f;

    if (strcmp(fname, "stdin") == 0)
        f = stdin;
    else if ((f = fopen(fname, "r")) == NULL)
        return MM_COULD_NOT_READ_FILE;

    *I = NULL;
    *J = NULL;
    *val = NULL;

    if ((ret_code = mm_read_banner(f, matcode)) != 0)
        goto done;

    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
            mm_is_matrix(*matcode)))
    {
        ret_code = MM_UNSUPPORTED_TYPE;
        goto done;
    }

    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
        goto done;

    *I = (int *) malloc(*nz * sizeof(int));
    *J = (int *) malloc(*nz * sizeof(int));
    if (*I == NULL || *J == NULL)
    {
        ret_code = MM_COULD_NOT_READ_FILE;
        goto fail;
    }

    if (mm_is_complex(*matcode))
        *val = (double *) malloc(*nz * 2 * sizeof(double));  /* real+imag pairs */
    else if (mm_is_real(*matcode) || mm_is_integer(*matcode))
        *val = (double *) malloc(*nz * sizeof(double));
    /* pattern matrices carry no values: *val stays NULL */

    if (!mm_is_pattern(*matcode) && *val == NULL)
    {
        ret_code = MM_COULD_NOT_READ_FILE;
        goto fail;
    }

    ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode);
    if (ret_code != 0)
        goto fail;

done:
    if (f != stdin) fclose(f);
    return ret_code;

fail:
    free(*I);
    free(*J);
    free(*val);
    *I = NULL;
    *J = NULL;
    *val = NULL;
    if (f != stdin) fclose(f);
    return ret_code;
}
/* Write the Matrix Market banner line for matcode to f.
 * Returns 0 on success, MM_COULD_NOT_WRITE_FILE on failure (including an
 * invalid typecode that cannot be rendered as a string). */
int mm_write_banner(FILE *f, MM_typecode matcode)
{
    char *str = mm_typecode_to_str(matcode);
    int ret_code;

    /* mm_typecode_to_str() returns NULL for an invalid typecode; passing
       NULL to a %s conversion would be undefined behavior */
    if (str == NULL)
        return MM_COULD_NOT_WRITE_FILE;

    /* fprintf returns a character count, not the conversion count, so the
       original `!= 2` test flagged every successful write as an error */
    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
    free(str);
    if (ret_code < 0)
        return MM_COULD_NOT_WRITE_FILE;
    else
        return 0;
}
/* Write a complete coordinate-format Matrix Market file to `fname`
 * ("stdout" writes to standard output).  I[]/J[] hold the indices as
 * they should appear in the file; val[] is ignored for pattern matrices
 * and holds 2*nz doubles (real/imag pairs) for complex ones.
 * Returns 0, MM_COULD_NOT_WRITE_FILE, or MM_UNSUPPORTED_TYPE. */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
        double val[], MM_typecode matcode)
{
    FILE *f;
    int i;
    char *str;

    if (strcmp(fname, "stdout") == 0)
        f = stdout;
    else
    if ((f = fopen(fname, "w")) == NULL)
        return MM_COULD_NOT_WRITE_FILE;

    /* render the typecode first: mm_typecode_to_str() returns a malloc'd
       string (leaked by the original code) or NULL when the typecode is
       invalid, which would have been undefined behavior under %s */
    str = mm_typecode_to_str(matcode);
    if (str == NULL)
    {
        if (f != stdout) fclose(f);
        return MM_UNSUPPORTED_TYPE;
    }

    /* print banner followed by typecode */
    fprintf(f, "%s ", MatrixMarketBanner);
    fprintf(f, "%s\n", str);
    free(str);

    /* print matrix sizes and nonzeros */
    fprintf(f, "%d %d %d\n", M, N, nz);

    /* print values */
    if (mm_is_pattern(matcode))
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d\n", I[i], J[i]);
    else
    if (mm_is_integer(matcode))
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d %d\n", I[i], J[i], (int)val[i]);
    else
    if (mm_is_real(matcode))
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
    else
    if (mm_is_complex(matcode))
        for (i=0; i<nz; i++)
            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
                    val[2*i+1]);
    else
    {
        /* unreachable when str != NULL; kept as a safety net */
        if (f != stdout) fclose(f);
        return MM_UNSUPPORTED_TYPE;
    }

    if (f != stdout) fclose(f);
    return 0;
}
/**
 * Create a new heap copy of string s.  mm_strdup() is a common routine,
 * but not part of ANSI C, so it is included here.  Used by
 * mm_typecode_to_str().
 * Returns NULL if allocation fails (the original called strcpy(NULL, s),
 * crashing on out-of-memory).
 */
static char *mm_strdup(const char *s)
{
    size_t len = strlen(s);
    char *s2 = (char *) malloc((len+1)*sizeof(char));
    if (s2 == NULL)
        return NULL;
    return strcpy(s2, s);
}
/* Render matcode as its four-token banner string, e.g.
 * "matrix coordinate real general".  The result is heap-allocated via
 * mm_strdup() and must be free()d by the caller.  Returns NULL when any
 * field of the typecode is unrecognized. */
char *mm_typecode_to_str(MM_typecode matcode)
{
    char buffer[MM_MAX_LINE_LENGTH];
    char *types[4];   /* one token per typecode position */

    /* check for MTX type */
    if (mm_is_matrix(matcode))
        types[0] = MM_MTX_STR;
    else
        return NULL;

    /* check for CRD or ARR matrix */
    if (mm_is_sparse(matcode))
        types[1] = MM_SPARSE_STR;
    else
    if (mm_is_dense(matcode))
        types[1] = MM_DENSE_STR;
    else
        return NULL;

    /* check for element data type */
    if (mm_is_real(matcode))
        types[2] = MM_REAL_STR;
    else
    if (mm_is_complex(matcode))
        types[2] = MM_COMPLEX_STR;
    else
    if (mm_is_pattern(matcode))
        types[2] = MM_PATTERN_STR;
    else
    if (mm_is_integer(matcode))
        types[2] = MM_INT_STR;
    else
        return NULL;

    /* check for symmetry type */
    if (mm_is_general(matcode))
        types[3] = MM_GENERAL_STR;
    else
    if (mm_is_symmetric(matcode))
        types[3] = MM_SYMM_STR;
    else
    if (mm_is_hermitian(matcode))
        types[3] = MM_HERM_STR;
    else
    if (mm_is_skew(matcode))
        types[3] = MM_SKEW_STR;
    else
        return NULL;

    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
    return mm_strdup(buffer);
}

View File

@ -0,0 +1,141 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#ifndef MM_IO_H
#define MM_IO_H

/* The prototypes below use FILE*, so include <stdio.h> here to make the
   header self-contained instead of relying on every includer having
   pulled in <stdio.h> first (the original header did not compile when
   included on its own). */
#include <stdio.h>

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64

/* 4-character typecode: [object][format][data type][symmetry] */
typedef char MM_typecode[4];

char *mm_typecode_to_str(MM_typecode matcode);

int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);

int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);

/********************* MM_typecode query functions ***************************/

#define mm_is_matrix(typecode) ((typecode)[0]=='M')

#define mm_is_sparse(typecode) ((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode) ((typecode)[1]=='A')
#define mm_is_array(typecode) ((typecode)[1]=='A')

#define mm_is_complex(typecode) ((typecode)[2]=='C')
#define mm_is_real(typecode) ((typecode)[2]=='R')
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')

#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode) ((typecode)[3]=='G')
#define mm_is_skew(typecode) ((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')

int mm_is_valid(MM_typecode matcode); /* too complex for a macro */

/********************* MM_typecode modify functions ***************************/

#define mm_set_matrix(typecode) ((*typecode)[0]='M')
#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
#define mm_set_array(typecode) ((*typecode)[1]='A')
#define mm_set_dense(typecode) mm_set_array(typecode)
#define mm_set_sparse(typecode) mm_set_coordinate(typecode)

#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode) ((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')

#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode) ((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')

#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
 (*typecode)[2]=' ',(*typecode)[3]='G')

#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)

/********************* Matrix Market error codes ***************************/

#define MM_COULD_NOT_READ_FILE 11
#define MM_PREMATURE_EOF 12
#define MM_NOT_MTX 13
#define MM_NO_HEADER 14
#define MM_UNSUPPORTED_TYPE 15
#define MM_LINE_TOO_LONG 16
#define MM_COULD_NOT_WRITE_FILE 17

/******************** Matrix Market internal definitions ********************
 MM_matrix_typecode: 4-character sequence
 ojbect sparse/ data storage
 dense type scheme
 string position: [0] [1] [2] [3]
 Matrix typecode: M(atrix) C(oord) R(eal) G(eneral)
 A(array) C(omplex) H(ermitian)
 P(attern) S(ymmetric)
 I(nteger) K(kew)
 ***********************************************************************/

#define MM_MTX_STR "matrix"
#define MM_ARRAY_STR "array"
#define MM_DENSE_STR "array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR "coordinate"
#define MM_COMPLEX_STR "complex"
#define MM_REAL_STR "real"
#define MM_INT_STR "integer"
#define MM_GENERAL_STR "general"
#define MM_SYMM_STR "symmetric"
#define MM_HERM_STR "hermitian"
#define MM_SKEW_STR "skew-symmetric"
#define MM_PATTERN_STR "pattern"

/* high level routines */

int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
 double **val, MM_typecode *matcode);
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
 double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
 double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
 MM_typecode matcode);

int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
 double **val_, int **I_, int **J_);

#if defined(__cplusplus)
}
#endif /* __cplusplus */

#endif

View File

@ -0,0 +1,529 @@
/* _CRT_SECURE_NO_WARNINGS must be defined BEFORE any CRT header is
 * included to suppress MSVC warnings about strcpy, fscanf, etc.; the
 * original placed it after <stdio.h>, where it had no effect. */
#if defined(_WIN32)
#define _CRT_SECURE_NO_WARNINGS
#endif

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mmio.h"

#include <cusolverDn.h>
/* cuGet<T>(...) — construct a value of T (float, double, cuComplex, or
 * cuDoubleComplex) from one or two scalar components of type int, float,
 * or double.  For the real-valued targets the second (imaginary)
 * component is discarded; for the complex targets a missing imaginary
 * component defaults to zero. */
template <typename T_ELEM> __inline__ T_ELEM cuGet(int);
template <> __inline__ float cuGet<float>(int x)
{
    return static_cast<float>(x);
}
template <> __inline__ double cuGet<double>(int x)
{
    return static_cast<double>(x);
}
template <> __inline__ cuComplex cuGet<cuComplex>(int x)
{
    return make_cuComplex(static_cast<float>(x), 0.0f);
}
template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x)
{
    return make_cuDoubleComplex(static_cast<double>(x), 0.0);
}

template <typename T_ELEM> __inline__ T_ELEM cuGet(int, int);
template <> __inline__ float cuGet<float>(int x, int y)
{
    return static_cast<float>(x);  /* imaginary component discarded */
}
template <> __inline__ double cuGet<double>(int x, int y)
{
    return static_cast<double>(x);  /* imaginary component discarded */
}
template <> __inline__ cuComplex cuGet<cuComplex>(int x, int y)
{
    return make_cuComplex(static_cast<float>(x), static_cast<float>(y));
}
template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x, int y)
{
    return make_cuDoubleComplex(static_cast<double>(x), static_cast<double>(y));
}

template <typename T_ELEM> __inline__ T_ELEM cuGet(float);
template <> __inline__ float cuGet<float>(float x)
{
    return x;
}
template <> __inline__ double cuGet<double>(float x)
{
    return static_cast<double>(x);
}
template <> __inline__ cuComplex cuGet<cuComplex>(float x)
{
    return make_cuComplex(x, 0.0f);
}
template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x)
{
    return make_cuDoubleComplex(static_cast<double>(x), 0.0);
}

template <typename T_ELEM> __inline__ T_ELEM cuGet(float, float);
template <> __inline__ float cuGet<float>(float x, float y)
{
    return x;  /* imaginary component discarded */
}
template <> __inline__ double cuGet<double>(float x, float y)
{
    return static_cast<double>(x);  /* imaginary component discarded */
}
template <> __inline__ cuComplex cuGet<cuComplex>(float x, float y)
{
    return make_cuComplex(x, y);
}
template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x, float y)
{
    return make_cuDoubleComplex(static_cast<double>(x), static_cast<double>(y));
}

template <typename T_ELEM> __inline__ T_ELEM cuGet(double);
template <> __inline__ float cuGet<float>(double x)
{
    return static_cast<float>(x);
}
template <> __inline__ double cuGet<double>(double x)
{
    return x;
}
template <> __inline__ cuComplex cuGet<cuComplex>(double x)
{
    return make_cuComplex(static_cast<float>(x), 0.0f);
}
template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x)
{
    return make_cuDoubleComplex(x, 0.0);
}

template <typename T_ELEM> __inline__ T_ELEM cuGet(double, double);
template <> __inline__ float cuGet<float>(double x, double y)
{
    return static_cast<float>(x);  /* imaginary component discarded */
}
template <> __inline__ double cuGet<double>(double x, double y)
{
    return x;  /* imaginary component discarded */
}
template <> __inline__ cuComplex cuGet<cuComplex>(double x, double y)
{
    return make_cuComplex(static_cast<float>(x), static_cast<float>(y));
}
template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x, double y)
{
    return make_cuDoubleComplex(x, y);
}
/* Compresses a sorted COO index array into a CSR/CSC pointer array.
 *
 * Ind : nnz sorted indices (rows for CSR, columns for CSC)
 * nnz : number of stored entries
 * m   : number of rows (CSR) / columns (CSC)
 * Ptr : output array of m+1 offsets
 * base: index base of the input, 0 or 1; Ptr[0] is set to this base
 */
static void compress_index(
    const int *Ind,
    int nnz,
    int m,
    int *Ptr,
    int base)
{
    /* clear the histogram, then seed the first slot with the index base */
    for (int k = 0; k <= m; k++) {
        Ptr[k] = 0;
    }
    Ptr[0] = base;

    /* count entries per row/column; the (1 - base) shift maps base-1 input
       index v into histogram slot v (slot 0 holds the base itself) */
    for (int k = 0; k < nnz; k++) {
        Ptr[Ind[k] + (1 - base)]++;
    }

    /* running sum turns per-row counts into start offsets */
    for (int k = 0; k < m; k++) {
        Ptr[k + 1] += Ptr[k];
    }
}
/* One COO (coordinate-format) entry paired with its original position, so
 * that after sorting by index the values can be gathered via 'p'. */
struct cooFormat {
int i ; // row index
int j ; // column index
int p ; // permutation (original position of this entry before sorting)
};
/* qsort comparator for row-major (CSR) ordering: sort by row index first,
 * then by column index within a row.  Returns <0, 0, or >0. */
int cmp_cooFormat_csr( struct cooFormat *s, struct cooFormat *t)
{
    if (s->i != t->i) {
        return (s->i < t->i) ? -1 : 1;
    }
    /* same row: order by column */
    return s->j - t->j;
}
/* qsort comparator for column-major (CSC) ordering: sort by column index
 * first, then by row index within a column.  Returns <0, 0, or >0. */
int cmp_cooFormat_csc( struct cooFormat *s, struct cooFormat *t)
{
    if (s->j != t->j) {
        return (s->j < t->j) ? -1 : 1;
    }
    /* same column: order by row */
    return s->i - t->i;
}
/* qsort expects an int (*)(const void*, const void*) comparator; the
 * comparators above are declared on cooFormat*, so they are stored with
 * their real type (FUNPTR2) and cast to FUNPTR at the qsort call site. */
typedef int (*FUNPTR) (const void*, const void*) ;
typedef int (*FUNPTR2) ( struct cooFormat *s, struct cooFormat *t) ;

/* [0] = row-major (CSR) ordering, [1] = column-major (CSC) ordering */
static FUNPTR2 fptr_array[2] = {
cmp_cooFormat_csr,
cmp_cooFormat_csc,
};
/* Sanity-checks a compressed (CSR or CSC) sparsity pattern.
 *
 * Checks, in order:
 *   - csrRowPtr[m] - csrRowPtr[0] equals nnz,
 *   - the base index (csrRowPtr[0]) is 0 or 1,
 *   - every row range is non-decreasing (start <= end),
 *   - every column index is >= base, and indices are strictly increasing
 *     within each row (sorted, no duplicates).
 *
 * m         : number of rows (CSR) or columns (CSC)
 * nnz       : number of stored entries
 * csrRowPtr : m+1 offsets
 * csrColInd : nnz indices
 *
 * Returns 0 if the pattern is consistent, 1 otherwise (details on stderr).
 */
static int verify_pattern(
    int m,
    int nnz,
    int *csrRowPtr,
    int *csrColInd)
{
    int i, col, start, end, base_index;
    int error_found = 0;

    /* the pointer array must account for exactly nnz entries */
    if (nnz != (csrRowPtr[m] - csrRowPtr[0])){
        fprintf(stderr, "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n", 0, csrRowPtr[0], m, csrRowPtr[m], nnz);
        error_found = 1;
    }

    /* only base-0 and base-1 indexing are supported */
    base_index = csrRowPtr[0];
    if ((0 != base_index) && (1 != base_index)){
        fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index);
        error_found = 1;
    }

    for (i = 0; (!error_found) && (i < m); i++){
        start = csrRowPtr[i    ] - base_index;
        end   = csrRowPtr[i + 1] - base_index;
        if (start > end){
            fprintf(stderr, "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", i, start + base_index, i + 1, end + base_index);
            error_found = 1;
        }
        for (col = start; col < end; col++){
            if (csrColInd[col] < base_index){
                fprintf(stderr, "Error (column vs. base index check failed): csrColInd[%d] < %d\n", col, base_index);
                error_found = 1;
            }
            /* strictly increasing => sorted and duplicate-free
               (fixed typo in the message: "indecis" -> "indices") */
            if ((col < (end - 1)) && (csrColInd[col] >= csrColInd[col + 1])){
                fprintf(stderr, "Error (sorting of the column indices check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", col, csrColInd[col], col + 1, csrColInd[col + 1]);
                error_found = 1;
            }
        }
    }

    return error_found;
}
/* Loads a sparse matrix from a Matrix Market (.mtx) file into host memory.
 *
 * filename        : path of the .mtx file
 * elem_type       : must be 'c' or 'z' when the file holds complex data
 * csrFormat       : true  -> CSR output (aRowInd = m+1 offsets, aColInd = nnz indices)
 *                   false -> CSC output (aColInd = n+1 offsets, aRowInd = nnz indices)
 * m, n, nnz       : outputs; nnz includes mirrored entries when extended
 * aVal            : nnz values converted to T_ELEM (malloc'ed here)
 * aRowInd/aColInd : index arrays (malloc'ed here)
 * extendSymMatrix : nonzero -> symmetric/hermitian/skew matrices stored as a
 *                   triangle are expanded to the full pattern
 *
 * The index base (0 or 1) is auto-detected from the data.  Returns 0 on
 * success, 1 on failure.  On failure all intermediate allocations are
 * released (fixes leaks in the original error paths) and the output
 * pointers must not be used. */
template <typename T_ELEM>
int loadMMSparseMatrix(
    char *filename,
    char elem_type,
    bool csrFormat,
    int *m,
    int *n,
    int *nnz,
    T_ELEM **aVal,
    int **aRowInd,
    int **aColInd,
    int extendSymMatrix)
{
    MM_typecode matcode;
    double *tempVal;
    int *tempRowInd, *tempColInd;
    double *tval;
    int *trow, *tcol;
    int *csrRowPtr, *cscColPtr;
    int i, j, error, base, count;
    struct cooFormat *work;

    /* read the matrix as COO triplets (trow, tcol, tval), allocated by mmio */
    error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode);
    if (error) {
        fprintf(stderr, "!!!! can not open file: '%s'\n", filename);
        return 1;
    }

    /* start error checking */
    if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) {
        fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n");
        free(trow); free(tcol); free(tval);
        return 1;
    }

    if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode) /*|| mm_is_integer(matcode)*/){
        fprintf(stderr, "!!!! dense, array, pattern and integer matrices are not supported\n");
        free(trow); free(tcol); free(tval);
        return 1;
    }

    /* if necessary symmetrize the pattern (transform from triangular to full) */
    if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))){
        /* count number of non-diagonal elements (each needs a mirrored copy) */
        count = 0;
        for (i = 0; i < (*nnz); i++){
            if (trow[i] != tcol[i]){
                count++;
            }
        }
        /* allocate space for the symmetrized matrix */
        tempRowInd = (int *)malloc((*nnz + count) * sizeof(int));
        tempColInd = (int *)malloc((*nnz + count) * sizeof(int));
        if (mm_is_real(matcode) || mm_is_integer(matcode)){
            tempVal = (double *)malloc((*nnz + count) * sizeof(double));
        }
        else{
            /* complex values are stored as interleaved (re, im) pairs */
            tempVal = (double *)malloc(2 * (*nnz + count) * sizeof(double));
        }
        if ((NULL == tempRowInd) || (NULL == tempColInd) || (NULL == tempVal)){
            fprintf(stderr, "!!!! allocation error, malloc failed\n");
            free(tempRowInd); free(tempColInd); free(tempVal);
            free(trow); free(tcol); free(tval);
            return 1;
        }
        /* copy the elements to regular and transposed locations */
        for (j = 0, i = 0; i < (*nnz); i++){
            tempRowInd[j] = trow[i];
            tempColInd[j] = tcol[i];
            if (mm_is_real(matcode) || mm_is_integer(matcode)){
                tempVal[j] = tval[i];
            }
            else{
                tempVal[2 * j]     = tval[2 * i];
                tempVal[2 * j + 1] = tval[2 * i + 1];
            }
            j++;
            if (trow[i] != tcol[i]){
                /* mirrored entry: negated for skew, conjugated for hermitian */
                tempRowInd[j] = tcol[i];
                tempColInd[j] = trow[i];
                if (mm_is_real(matcode) || mm_is_integer(matcode)){
                    if (mm_is_skew(matcode)){
                        tempVal[j] = -tval[i];
                    }
                    else{
                        tempVal[j] =  tval[i];
                    }
                }
                else{
                    if (mm_is_hermitian(matcode)){
                        tempVal[2 * j]     =  tval[2 * i];
                        tempVal[2 * j + 1] = -tval[2 * i + 1];
                    }
                    else{
                        tempVal[2 * j]     =  tval[2 * i];
                        tempVal[2 * j + 1] =  tval[2 * i + 1];
                    }
                }
                j++;
            }
        }
        (*nnz) += count;
        /* free temporary storage */
        free(trow);
        free(tcol);
        free(tval);
    }
    else{
        tempRowInd = trow;
        tempColInd = tcol;
        tempVal    = tval;
    }
    /* life time of (trow, tcol, tval) is over.
       please use COO format (tempRowInd, tempColInd, tempVal) */

    /* use qsort to sort COO format */
    work = (struct cooFormat *)malloc(sizeof(struct cooFormat) * (*nnz));
    if (NULL == work){
        fprintf(stderr, "!!!! allocation error, malloc failed\n");
        free(tempRowInd); free(tempColInd); free(tempVal);
        return 1;
    }
    for (i = 0; i < (*nnz); i++){
        work[i].i = tempRowInd[i];
        work[i].j = tempColInd[i];
        work[i].p = i; /* permutation is identity */
    }

    if (csrFormat){
        /* create row-major ordering of indices (sorted by row and within each row by column) */
        qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[0]);
    }else{
        /* create column-major ordering of indices (sorted by column and within each column by row) */
        qsort(work, *nnz, sizeof(struct cooFormat), (FUNPTR)fptr_array[1]);
    }

    /* (tempRowInd, tempColInd) is sorted either by row-major or by col-major */
    for (i = 0; i < (*nnz); i++){
        tempRowInd[i] = work[i].i;
        tempColInd[i] = work[i].j;
    }

    /* setup base:
       check if there is any row/col 0, if so base-0
       check if there is any row/col equal to matrix dimension m/n, if so base-1 */
    int base0 = 0;
    int base1 = 0;
    for (i = 0; i < (*nnz); i++){
        const int row = tempRowInd[i];
        const int col = tempColInd[i];
        if ((0 == row) || (0 == col)){
            base0 = 1;
        }
        if ((*m == row) || (*n == col)){
            base1 = 1;
        }
    }
    if (base0 && base1){
        printf("Error: input matrix is base-0 and base-1 \n");
        free(work); free(tempVal); free(tempColInd); free(tempRowInd);
        return 1;
    }

    base = 0;
    if (base1){
        base = 1;
    }

    /* compress the appropriate indices */
    if (csrFormat){
        /* CSR format (assuming row-major format) */
        csrRowPtr = (int *)malloc(((*m) + 1) * sizeof(csrRowPtr[0]));
        if (!csrRowPtr){
            free(work); free(tempVal); free(tempColInd); free(tempRowInd);
            return 1;
        }
        compress_index(tempRowInd, *nnz, *m, csrRowPtr, base);

        *aRowInd = csrRowPtr;
        *aColInd = (int *)malloc((*nnz) * sizeof(int));
        if (NULL == *aColInd){
            free(csrRowPtr); *aRowInd = NULL;
            free(work); free(tempVal); free(tempColInd); free(tempRowInd);
            return 1;
        }
    }
    else {
        /* CSC format (assuming column-major format) */
        cscColPtr = (int *)malloc(((*n) + 1) * sizeof(cscColPtr[0]));
        if (!cscColPtr){
            free(work); free(tempVal); free(tempColInd); free(tempRowInd);
            return 1;
        }
        compress_index(tempColInd, *nnz, *n, cscColPtr, base);

        *aColInd = cscColPtr;
        *aRowInd = (int *)malloc((*nnz) * sizeof(int));
        if (NULL == *aRowInd){
            free(cscColPtr); *aColInd = NULL;
            free(work); free(tempVal); free(tempColInd); free(tempRowInd);
            return 1;
        }
    }

    /* transform the matrix values of type double into one of the cusparse library types */
    *aVal = (T_ELEM *)malloc((*nnz) * sizeof(T_ELEM));
    if (NULL == *aVal){
        free(*aRowInd); *aRowInd = NULL;
        free(*aColInd); *aColInd = NULL;
        free(work); free(tempVal); free(tempColInd); free(tempRowInd);
        return 1;
    }
    for (i = 0; i < (*nnz); i++) {
        if (csrFormat){
            (*aColInd)[i] = tempColInd[i];
        }
        else{
            (*aRowInd)[i] = tempRowInd[i];
        }
        /* work[i].p maps the sorted position back to the original value slot */
        if (mm_is_real(matcode) || mm_is_integer(matcode)){
            (*aVal)[i] = cuGet<T_ELEM>( tempVal[ work[i].p ] );
        }
        else{
            (*aVal)[i] = cuGet<T_ELEM>(tempVal[2 * work[i].p], tempVal[2 * work[i].p + 1]);
        }
    }

    /* check for corruption */
    int error_found;
    if (csrFormat){
        error_found = verify_pattern(*m, *nnz, *aRowInd, *aColInd);
    }else{
        error_found = verify_pattern(*n, *nnz, *aColInd, *aRowInd);
    }
    if (error_found){
        fprintf(stderr, "!!!! verify_pattern failed\n");
        free(*aVal);    *aVal    = NULL;
        free(*aRowInd); *aRowInd = NULL;
        free(*aColInd); *aColInd = NULL;
        free(work); free(tempVal); free(tempColInd); free(tempRowInd);
        return 1;
    }

    /* cleanup and exit */
    free(work);
    free(tempVal);
    free(tempColInd);
    free(tempRowInd);

    return 0;
}
/* specific instantiation: the four element types used by the samples
 * (float, double, cuComplex, cuDoubleComplex) */
template int loadMMSparseMatrix<float>(
char *filename,
char elem_type,
bool csrFormat,
int *m,
int *n,
int *nnz,
float **aVal,
int **aRowInd,
int **aColInd,
int extendSymMatrix);
template int loadMMSparseMatrix<double>(
char *filename,
char elem_type,
bool csrFormat,
int *m,
int *n,
int *nnz,
double **aVal,
int **aRowInd,
int **aColInd,
int extendSymMatrix);
template int loadMMSparseMatrix<cuComplex>(
char *filename,
char elem_type,
bool csrFormat,
int *m,
int *n,
int *nnz,
cuComplex **aVal,
int **aRowInd,
int **aColInd,
int extendSymMatrix);
template int loadMMSparseMatrix<cuDoubleComplex>(
char *filename,
char elem_type,
bool csrFormat,
int *m,
int *n,
int *nnz,
cuDoubleComplex **aVal,
int **aRowInd,
int **aColInd,
int extendSymMatrix);

/* End of excerpt.  (Diff-viewer residue — "Some files were not shown because
 * too many files have changed in this diff" — replaced by this marker.) */