Add and update samples with CUDA 10.1 Update 1 support

2026-06-04 00:06:52 +08:00 · 2019-04-10 20:12:09 +05:30 · 2019-04-10 20:12:09 +05:30 · 337815dbee
commit 337815dbee
parent 1abc294982
210 changed files with 46770 additions and 1057 deletions
--- a/Common/drvapi_error_string.h
+++ b/Common/drvapi_error_string.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/exception.h
+++ b/Common/exception.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_cuda.h
+++ b/Common/helper_cuda.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_cuda_drvapi.h
+++ b/Common/helper_cuda_drvapi.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_cusolver.h
+++ b/Common/helper_cusolver.h
@ -0,0 +1,166 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HELPER_CUSOLVER
+#define HELPER_CUSOLVER
+
+#include <ctype.h>
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cusparse.h"
+
+#define SWITCH_CHAR '-'
+
+struct testOpts {  // option values; each field's comment names the command-line switch that sets it
+  char *sparse_mat_filename;  // by switch -F<filename>
+  const char *testFunc;       // by switch -R<name>
+  const char *reorder;        // by switch -P<name>
+  int lda;                    // by switch -lda<int>; NOTE(review): presumably a leading dimension as used by mat_norminf -- confirm
+};
+
+double vec_norminf(int n, const double *x) {  // infinity norm of x: max |x[j]| for j in [0, n)
+  double norminf = 0;  // running maximum; returned unchanged when n <= 0
+  for (int j = 0; j < n; j++) {
+    double x_abs = fabs(x[j]);
+    norminf = (norminf > x_abs) ? norminf : x_abs;  // keep the larger of the two magnitudes
+  }
+  return norminf;
+}
+
+/*
+ * Infinity norm of a dense m-by-n matrix:
+ *   |A| = max { |A|*ones(n,1) }
+ * i.e. the largest row sum of absolute values.
+ *
+ * Element (row i, column j) is read as A[i + j * lda], so lda is the
+ * stride between consecutive columns. Returns 0 when m <= 0.
+ */
+double mat_norminf(int m, int n, const double *A, int lda) {
+  double norminf = 0;  // largest row sum seen so far
+  for (int i = 0; i < m; i++) {
+    double sum = 0.0;  // sum of |A(i, j)| over the columns of row i
+    for (int j = 0; j < n; j++) {
+      double A_abs = fabs(A[i + j * lda]);
+      sum += A_abs;
+    }
+    norminf = (norminf > sum) ? norminf : sum;
+  }
+  return norminf;
+}
+
+/*
+ * Infinity norm of a sparse matrix stored in CSR form:
+ *   |A| = max { |A|*ones(n,1) }
+ * i.e. the largest row sum of absolute values over the m rows.
+ *
+ * The index base (0 or 1) is taken from descrA and subtracted from the
+ * csrRowPtrA entries; the stored values of row i are
+ * csrValA[csrRowPtrA[i] - base .. csrRowPtrA[i + 1] - base).
+ * n, nnzA and csrColIndA are not used by the computation.
+ * The arrays are dereferenced directly, so they must be readable from
+ * the code that calls this function.
+ */
+double csr_mat_norminf(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
+                       const double *csrValA, const int *csrRowPtrA,
+                       const int *csrColIndA) {
+  const int baseA =
+      (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
+
+  double norminf = 0;  // largest row sum seen so far; result when m <= 0
+  for (int i = 0; i < m; i++) {
+    double sum = 0.0;  // sum of |value| over the stored entries of row i
+    const int start = csrRowPtrA[i] - baseA;  // zero-based offset of row i's first entry
+    const int end = csrRowPtrA[i + 1] - baseA;  // one past row i's last entry
+    for (int colidx = start; colidx < end; colidx++) {
+      // const int j = csrColIndA[colidx] - baseA;
+      double A_abs = fabs(csrValA[colidx]);
+      sum += A_abs;
+    }
+    norminf = (norminf > sum) ? norminf : sum;
+  }
+  return norminf;
+}
+
+void display_matrix(int m, int n, int nnzA, const cusparseMatDescr_t descrA,
+                    const double *csrValA, const int *csrRowPtrA,
+                    const int *csrColIndA) {  // prints every stored entry of a CSR matrix to stdout
+  const int baseA =
+      (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;  // index base of the input arrays
+
+  printf("m = %d, n = %d, nnz = %d, matlab base-1\n", m, n, nnzA);  // n and nnzA are only echoed here
+
+  for (int row = 0; row < m; row++) {
+    const int start = csrRowPtrA[row] - baseA;  // zero-based offset of this row's first entry
+    const int end = csrRowPtrA[row + 1] - baseA;  // one past this row's last entry
+    for (int colidx = start; colidx < end; colidx++) {
+      const int col = csrColIndA[colidx] - baseA;  // zero-based column index
+      double Areg = csrValA[colidx];
+      printf("A(%d, %d) = %20.16E\n", row + 1, col + 1, Areg);  // always printed one-based, whatever the input base
+    }
+  }
+}
+
+#if defined(_WIN32)
+#if !defined(WIN32_LEAN_AND_MEAN)
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+double second(void) {  // Windows timer: returns a time stamp in seconds
+  LARGE_INTEGER t;
+  static double oofreq;  // 1 / counter frequency, computed on the first call
+  static int checkedForHighResTimer;  // non-zero once the first-call setup has run
+  static BOOL hasHighResTimer;  // result of QueryPerformanceFrequency
+
+  if (!checkedForHighResTimer) {  // one-time setup; NOTE(review): these statics are written without any locking
+    hasHighResTimer = QueryPerformanceFrequency(&t);
+    oofreq = 1.0 / (double)t.QuadPart;  // only used below when hasHighResTimer is true
+    checkedForHighResTimer = 1;
+  }
+  if (hasHighResTimer) {
+    QueryPerformanceCounter(&t);
+    return (double)t.QuadPart * oofreq;  // counter ticks scaled to seconds
+  } else {
+    return (double)GetTickCount() / 1000.0;  // fallback: millisecond tick count scaled to seconds
+  }
+}
+
+#elif defined(__linux__) || defined(__QNX__)
+#include <stddef.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+double second(void) {  // Linux/QNX timer: returns the gettimeofday() time in seconds
+  struct timeval tv;
+  gettimeofday(&tv, NULL);  // return value is not checked
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;  // whole seconds plus the microsecond part
+}
+
+#elif defined(__APPLE__)
+#include <stddef.h>
+#include <sys/resource.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+double second(void) {  // macOS timer: returns the gettimeofday() time in seconds
+  struct timeval tv;
+  gettimeofday(&tv, NULL);  // return value is not checked
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;  // whole seconds plus the microsecond part
+}
+#else
+#error unsupported platform
+#endif
+
+#endif
--- a/Common/helper_functions.h
+++ b/Common/helper_functions.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_image.h
+++ b/Common/helper_image.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_multiprocess.cpp
+++ b/Common/helper_multiprocess.cpp
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_multiprocess.h
+++ b/Common/helper_multiprocess.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_string.h
+++ b/Common/helper_string.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_timer.h
+++ b/Common/helper_timer.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/nvrtc_helper.h
+++ b/Common/nvrtc_helper.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/2
+++ b/2
@ -1,4 +1,4 @@
-Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 ###############################################################################
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
--- a/README.md
+++ b/README.md
@ -6,6 +6,12 @@ Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This ve

 This section describes the release notes for the CUDA Samples on GitHub only.

+### CUDA 10.1 Update 1
+*  Added `NV12toBGRandResize`. Demonstrates how to convert and resize NV12 frames to BGR planar frames using CUDA in batch.
+*  Added `EGLStream_CUDA_Interop`. Demonstrates data exchange between CUDA and EGL Streams.
+*  Added `cuSolverDn_LinearSolver`. Demonstrates cuSolverDN's LU, QR and Cholesky factorization.
+*  Added support for Visual Studio 2019 to all samples supported on [Windows](#windows-1).
+
 ### CUDA 10.1
 *  Added `immaTensorCoreGemm`. Demonstrates integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integers employing the Tensor Cores.
 *  Added `simpleIPC`. Demonstrates Inter Process Communication with one process per GPU for computation.
@ -128,29 +134,32 @@ The samples makefiles can take advantage of certain options:
 ### Samples by OS

 #### Linux
-**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[shfl_scan](./Samples/shfl_scan)** |
+**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** |
 ---|---|---|---|
-**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** |
-**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** |
-**[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** |
-**[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** |
-**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** |
+**[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[nvJPEG](./Samples/nvJPEG)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
+**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** |
+**[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
+**[reduction](./Samples/reduction)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** |
+**[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** |

 #### Windows
 **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[shfl_scan](./Samples/shfl_scan)** |
 ---|---|---|---|
 **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** |
-**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** |
-**[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** |
-**[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** |
-**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** |
+**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** |
+**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** |
+**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** |
+**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** |
+**[matrixMul](./Samples/matrixMul)** |

 #### Mac OSX
 **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
 ---|---|---|---|
 **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** |
-**[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** | **[bandwidthTest](./Samples/bandwidthTest)** |
-**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** |
+**[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[reduction](./Samples/reduction)** |
+**[bandwidthTest](./Samples/bandwidthTest)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** |

 ## Dependencies

--- a/Samples/EGLStream_CUDA_Interop/Makefile
+++ b/Samples/EGLStream_CUDA_Interop/Makefile
@ -0,0 +1,364 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux Platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on Mac OSX
+ifeq ($(TARGET_OS),darwin)
+  $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on Mac OSX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on ARMv7
+ifeq ($(TARGET_ARCH),armv7l)
+  $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on ARMv7 - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on android
+ifeq ($(TARGET_OS),android)
+  $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on android - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Makefile include to help find EGL Libraries
+include ./findegl.mk
+
+# EGL specific libraries
+ifneq ($(TARGET_OS),darwin)
+ LIBRARIES += -lEGL
+endif
+
+ifeq ($(TARGET_OS),darwin)
+  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
+else
+  ifeq ($(TARGET_ARCH),x86_64)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
+  endif
+
+  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+  ifeq ("$(CUDALIB)","")
+    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
+    SAMPLE_ENABLED := 0
+  else
+    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
+    LIBRARIES += -L$(CUDALIB) -lcuda
+  endif
+endif
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: EGLStream_CUDA_Interop
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+cuda_consumer.o:cuda_consumer.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+cuda_producer.o:cuda_producer.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+eglstrm_common.o:eglstrm_common.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+main.o:main.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+EGLStream_CUDA_Interop: cuda_consumer.o cuda_producer.o eglstrm_common.o main.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./EGLStream_CUDA_Interop
+
+clean:
+	rm -f EGLStream_CUDA_Interop cuda_consumer.o cuda_producer.o eglstrm_common.o main.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/EGLStream_CUDA_Interop
+
+clobber: clean
--- a/Samples/EGLStream_CUDA_Interop/NsightEclipse.xml
+++ b/Samples/EGLStream_CUDA_Interop/NsightEclipse.xml
@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>EGLStream_CUDA_Interop</name>
+  <cuda_api_list>
+    <driver>cuDeviceGet</driver>
+    <driver>cuDeviceGetAttribute</driver>
+    <driver>cuDeviceComputeCapability</driver>
+    <driver>cuDeviceGetCount</driver>
+    <driver>cuDeviceGetName</driver>
+    <driver>cuGraphicsResourceGetMappedEglFrame</driver>
+    <driver>cuEGLStreamConsumerAcquireFrame</driver>
+    <driver>cuEGLStreamConsumerReleaseFrame</driver>
+    <driver>cuEGLStreamProducerPresentFrame</driver>
+    <driver>cuCtxCreate</driver>
+    <driver>cuMemAlloc</driver>
+    <driver>cuMemFree</driver>
+    <driver>cuMemcpy3D</driver>
+    <driver>cuStreamCreate</driver>
+    <driver>cuCtxPushCurrent</driver>
+    <driver>cuCtxPopCurrent</driver>
+    <driver>cuCtxDestroy</driver>
+  </cuda_api_list>
+  <description><![CDATA[Demonstrates data exchange between CUDA and EGL Streams.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../common/inc</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">EGLStreams Interop</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>EGL Streams</keyword>
+  </keywords>
+  <libraries>
+    <library os="linux">cuda</library>
+    <library framework="true" os="macosx">CUDA</library>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>main.cpp</primary_file>
+  <required_dependencies>
+    <dependency>EGL</dependency>
+  </required_dependencies>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>2:Graphics Interop</scope>
+  </scopes>
+  <sm-arch>sm30</sm-arch>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <arch>aarch64</arch>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>EGLStream CUDA Interop</title>
+  <type>exe</type>
+</entry>
--- a/Samples/EGLStream_CUDA_Interop/README.md
+++ b/Samples/EGLStream_CUDA_Interop/README.md
@ -0,0 +1,64 @@
+# EGLStream_CUDA_Interop - EGLStream CUDA Interop
+
+## Description
+
+Demonstrates data exchange between CUDA and EGL Streams.
+
+## Key Concepts
+
+EGLStreams Interop
+
+## Supported SM Architectures
+
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
+cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, cuDeviceGetName, cuGraphicsResourceGetMappedEglFrame, cuEGLStreamConsumerAcquireFrame, cuEGLStreamConsumerReleaseFrame, cuEGLStreamProducerPresentFrame, cuCtxCreate, cuMemAlloc, cuMemFree, cuMemcpy3D, cuStreamCreate, cuCtxPushCurrent, cuCtxPopCurrent, cuCtxDestroy
+
+## Dependencies needed to build/run
+[EGL](../../README.md#egl)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
--- a/Samples/EGLStream_CUDA_Interop/cuda_consumer.cpp
+++ b/Samples/EGLStream_CUDA_Interop/cuda_consumer.cpp
@ -0,0 +1,318 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple CUDA consumer rendering sample app
+//
+
+#include "cuda_consumer.h"
+#include <helper_cuda_drvapi.h>
+#include "eglstrm_common.h"
+
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+
+int checkbuf(FILE *fp1, FILE *fp2);
+
+// Consumes one frame from the EGL stream and checks it against the reference
+// images.
+//
+// When the stream reports EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR the frame is
+// acquired, each plane is copied into a host staging buffer (cuMemcpyDtoH when
+// data->pitchLinearOutput is set, cuMemcpy3D from the plane's CUDA array
+// otherwise), the planes are appended to `fileName`, and that file is compared
+// with data->fileName1 and then data->fileName2. For any other stream state
+// nothing is consumed and the initial status is returned.
+//
+// Parameters:
+//   data     - consumer state: stream connection, frame dimensions, layout
+//              flag and reference file names.
+//   fileName - path of the file the received frame is dumped to.
+//
+// Returns CUDA_SUCCESS when no step failed. Otherwise returns the failing CUDA
+// status, CUDA_ERROR_INVALID_VALUE for a NULL `data`, CUDA_ERROR_OUT_OF_MEMORY
+// when the staging buffer cannot be allocated, or CUDA_ERROR_UNKNOWN for file
+// and frame-layout failures. A frame that matches neither reference file is
+// only reported on stdout and does not change the return value.
+CUresult cudaConsumerTest(test_cuda_consumer_s *data, char *fileName) {
+  CUresult cuStatus = CUDA_SUCCESS;
+  CUarray cudaArr = NULL;
+  CUeglFrame cudaEgl;
+  CUgraphicsResource cudaResource;
+  unsigned int i;
+  int check_result;
+  FILE *pInFile1 = NULL, *pInFile2 = NULL, *file_p = NULL;
+  EGLint streamState = 0;
+  // Host staging buffer. Declared at function scope so the cleanup code at
+  // `done` can release it on every exit path.
+  unsigned char *pCudaCopyMem = NULL;
+
+  if (!data) {
+    printf("%s: Bad parameter\n", __func__);
+    cuStatus = CUDA_ERROR_INVALID_VALUE;
+    goto done;
+  }
+
+  // A failed query is only reported; streamState then keeps its initial 0 and
+  // neither of the state checks below matches.
+  if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
+                         &streamState)) {
+    printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+  }
+  if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) {
+    printf("CUDA Consumer: - EGL_STREAM_STATE_DISCONNECTED_KHR received\n");
+  }
+
+  if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) {
+    cuStatus = cuEGLStreamConsumerAcquireFrame(&(data->cudaConn), &cudaResource,
+                                               NULL, 16000);
+
+    if (cuStatus == CUDA_SUCCESS) {
+      CUdeviceptr pDevPtr = 0;
+      int bufferSize;
+      unsigned int copyWidthInBytes = 0, copyHeight = 0;
+
+      // "wb+" so the dump can be read back for the comparison below.
+      file_p = fopen(fileName, "wb+");
+      if (!file_p) {
+        printf("WriteFrame: file open failed %s\n", fileName);
+        cuStatus = CUDA_ERROR_UNKNOWN;
+        goto done;
+      }
+      cuStatus =
+          cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
+      if (cuStatus != CUDA_SUCCESS) {
+        printf("Cuda get resource failed with %d\n", cuStatus);
+        goto done;
+      }
+      cuStatus = cuCtxSynchronize();
+      if (cuStatus != CUDA_SUCCESS) {
+        printf("cuCtxSynchronize failed \n");
+        goto done;
+      }
+      if (!(cudaEgl.planeCount >= 1 && cudaEgl.planeCount <= 3)) {
+        printf("Plane count is invalid\nExiting\n");
+        cuStatus = CUDA_ERROR_UNKNOWN;
+        goto done;
+      }
+
+      for (i = 0; i < cudaEgl.planeCount; i++) {
+        // Work out the byte size and copy extent of plane i for the frame
+        // layout that was received.
+        if (cudaEgl.frameType == CU_EGL_FRAME_TYPE_PITCH) {
+          pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[i];
+          if (cudaEgl.planeCount == 1) {
+            bufferSize = cudaEgl.pitch * cudaEgl.height;
+            copyWidthInBytes = cudaEgl.pitch;
+            copyHeight = data->height;
+          } else if (i == 1 && cudaEgl.planeCount == 2) {  // YUV 420
+                                                           // semi-planar
+            bufferSize = cudaEgl.pitch * cudaEgl.height / 2;
+            copyWidthInBytes = cudaEgl.pitch;
+            copyHeight = data->height / 2;
+          } else {
+            bufferSize = data->width * data->height;
+            copyWidthInBytes = data->width;
+            copyHeight = data->height;
+            if (i > 0) {
+              bufferSize >>= 2;
+              copyWidthInBytes >>= 1;
+              copyHeight >>= 1;
+            }
+          }
+        } else {
+          cudaArr = cudaEgl.frame.pArray[i];
+          if (cudaEgl.planeCount == 1) {
+            bufferSize = data->width * data->height * 4;
+            copyWidthInBytes = data->width * 4;
+            copyHeight = data->height;
+          } else if (i == 1 && cudaEgl.planeCount == 2) {  // YUV 420
+                                                           // semi-planar
+            bufferSize = data->width * data->height / 2;
+            copyWidthInBytes = data->width;
+            copyHeight = data->height / 2;
+          } else {
+            bufferSize = data->width * data->height;
+            copyWidthInBytes = data->width;
+            copyHeight = data->height;
+            if (i > 0) {
+              bufferSize >>= 2;
+              copyWidthInBytes >>= 1;
+              copyHeight >>= 1;
+            }
+          }
+        }
+        // The staging buffer is sized for plane 0 and reused for the later
+        // planes, none of which is computed to be larger above.
+        if (i == 0) {
+          pCudaCopyMem = (unsigned char *)malloc(bufferSize);
+          if (pCudaCopyMem == NULL) {
+            printf("pCudaCopyMem malloc failed\n");
+            cuStatus = CUDA_ERROR_OUT_OF_MEMORY;
+            goto done;
+          }
+        }
+        memset(pCudaCopyMem, 0, bufferSize);
+        if (data->pitchLinearOutput) {
+          cuStatus = cuMemcpyDtoH(pCudaCopyMem, pDevPtr, bufferSize);
+          if (cuStatus != CUDA_SUCCESS) {
+            printf(
+                "cuda_consumer: pitch linear Memcpy failed, bufferSize =%d\n",
+                bufferSize);
+            goto done;
+          }
+          cuStatus = cuCtxSynchronize();
+          if (cuStatus != CUDA_SUCCESS) {
+            printf("cuda_consumer: cuCtxSynchronize failed after memcpy \n");
+            goto done;
+          }
+        } else {
+          CUDA_MEMCPY3D cpdesc;
+          memset(&cpdesc, 0, sizeof(cpdesc));
+          cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0;
+          cpdesc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+          cpdesc.srcArray = cudaArr;
+          cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0;
+          cpdesc.dstMemoryType = CU_MEMORYTYPE_HOST;
+          cpdesc.dstHost = (void *)pCudaCopyMem;
+          cpdesc.WidthInBytes = copyWidthInBytes;  // data->width * 4;
+          cpdesc.Height = copyHeight;              // data->height;
+          cpdesc.Depth = 1;
+
+          // Stop at the first failure so a later successful call cannot hide
+          // the error and let an incomplete buffer reach the output file.
+          cuStatus = cuMemcpy3D(&cpdesc);
+          if (cuStatus != CUDA_SUCCESS) {
+            printf(
+                "Cuda consumer: cuMemCpy3D failed,  copyWidthInBytes=%d, "
+                "copyHight=%d\n",
+                copyWidthInBytes, copyHeight);
+            goto done;
+          }
+          cuStatus = cuCtxSynchronize();
+          if (cuStatus != CUDA_SUCCESS) {
+            printf("cuCtxSynchronize failed after memcpy \n");
+            goto done;
+          }
+        }
+        if (fwrite(pCudaCopyMem, bufferSize, 1, file_p) != 1) {
+          printf("Cuda consumer: output file write failed\n");
+          cuStatus = CUDA_ERROR_UNKNOWN;
+          goto done;
+        }
+      }
+      pInFile1 = fopen(data->fileName1, "rb");
+      if (!pInFile1) {
+        printf("Failed to open file :%s\n", data->fileName1);
+        cuStatus = CUDA_ERROR_UNKNOWN;
+        goto done;
+      }
+      pInFile2 = fopen(data->fileName2, "rb");
+      if (!pInFile2) {
+        printf("Failed to open file :%s\n", data->fileName2);
+        cuStatus = CUDA_ERROR_UNKNOWN;
+        goto done;
+      }
+      // Compare the dump with the first reference file and fall back to the
+      // second one only when the first does not match.
+      rewind(file_p);
+      check_result = checkbuf(file_p, pInFile1);
+      if (check_result == -1) {
+        rewind(file_p);
+        check_result = checkbuf(file_p, pInFile2);
+        if (check_result == -1) {
+          printf("Frame received does not match any valid image: FAILED\n");
+        } else {
+          printf("Frame check Passed\n");
+        }
+      } else {
+        printf("Frame check Passed\n");
+      }
+      cuStatus =
+          cuEGLStreamConsumerReleaseFrame(&data->cudaConn, cudaResource, NULL);
+      if (cuStatus != CUDA_SUCCESS) {
+        printf("cuEGLStreamConsumerReleaseFrame failed with cuStatus = %d\n",
+               cuStatus);
+        goto done;
+      }
+    } else {
+      printf("cuda AcquireFrame FAILED with  cuStatus=%d\n", cuStatus);
+      goto done;
+    }
+  }
+
+done:
+  // Single cleanup point: release the staging buffer and every file that was
+  // opened, whichever path led here.
+  if (pCudaCopyMem) {
+    free(pCudaCopyMem);
+    pCudaCopyMem = NULL;
+  }
+  if (file_p) {
+    fclose(file_p);
+    file_p = NULL;
+  }
+  if (pInFile1) {
+    fclose(pInFile1);
+    pInFile1 = NULL;
+  }
+  if (pInFile2) {
+    fclose(pInFile2);
+    pInFile2 = NULL;
+  }
+  return cuStatus;
+}
+
+// Compares two open streams byte by byte, starting at their current positions.
+//
+// Returns 1 when both streams reach EOF together with every byte equal, and
+// -1 when a byte differs, when one stream ends before the other, or when
+// either stream pointer is NULL.
+int checkbuf(FILE *fp1, FILE *fp2) {
+  if (fp1 == NULL) {
+    printf("Invalid file pointer for first file\n");
+    return -1;
+  }
+  if (fp2 == NULL) {
+    printf("Invalid file pointer for second file\n");
+    return -1;
+  }
+
+  int lhs, rhs;
+  // Read one byte from each stream per step; stop on the first difference or
+  // as soon as either stream is exhausted.
+  do {
+    lhs = getc(fp1);
+    rhs = getc(fp2);
+  } while ((lhs != EOF) && (rhs != EOF) && (lhs == rhs));
+
+  // Equal values here can only mean both reads returned EOF.
+  return (lhs == rhs) ? 1 : -1;
+}
+
+// Initialises the CUDA driver, prints the name and compute capability of
+// `device`, and creates the consumer's context on it.
+//
+// The new context is stored in cudaConsumer->context and popped again before
+// returning. Returns the status of cuInit or cuCtxCreate when either fails,
+// CUDA_SUCCESS otherwise; the attribute, name and pop calls go through
+// checkCudaErrors.
+CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer,
+                                  CUdevice device) {
+  CUresult status = cuInit(0);
+  if (status != CUDA_SUCCESS) {
+    printf("Failed to initialize CUDA\n");
+    return status;
+  }
+
+  const int kNameLen = 256;
+  char deviceName[kNameLen];
+  int ccMajor = 0;
+  int ccMinor = 0;
+  checkCudaErrors(cuDeviceGetAttribute(
+      &ccMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &ccMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
+  checkCudaErrors(cuDeviceGetName(deviceName, kNameLen, device));
+  printf(
+      "CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
+      "%d.%d\n\n",
+      device, deviceName, ccMajor, ccMinor);
+
+  status = cuCtxCreate(&cudaConsumer->context, 0, device);
+  if (status != CUDA_SUCCESS) {
+    printf("failed to create CUDA context\n");
+    return status;
+  }
+
+  // Take the context off the calling thread again; the popped handle is
+  // written back into cudaConsumer->context.
+  checkCudaErrors(cuCtxPopCurrent(&cudaConsumer->context));
+  return status;
+}
+
+// Copies the command-line settings in `args` into the consumer state and sets
+// the fixed names of the two output dump files.
+void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args) {
+  // Reference images the received frames are compared against.
+  cudaConsumer->fileName1 = args->infile1;
+  cudaConsumer->fileName2 = args->infile2;
+
+  // Frame geometry and memory layout.
+  cudaConsumer->width = args->inputWidth;
+  cudaConsumer->height = args->inputHeight;
+  cudaConsumer->pitchLinearOutput = args->pitchLinearOutput;
+
+  // Dump file names.
+  cudaConsumer->outFile1 = "cuda_out1.yuv";
+  cudaConsumer->outFile2 = "cuda_out2.yuv";
+}
+
+// Disconnects the consumer's CUDA connection from the EGL stream and returns
+// the status of that call.
+CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer) {
+  CUeglStreamConnection *connection = &cudaConsumer->cudaConn;
+  return cuEGLStreamConsumerDisconnect(connection);
+}
--- a/Samples/EGLStream_CUDA_Interop/cuda_consumer.h
+++ b/Samples/EGLStream_CUDA_Interop/cuda_consumer.h
@ -0,0 +1,62 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   CUDA consumer header file
+//
+
+#ifndef _CUDA_CONSUMER_H_
+#define _CUDA_CONSUMER_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cudaEGL.h"
+#include "eglstrm_common.h"
+
+extern EGLStreamKHR eglStream;
+extern EGLDisplay g_display;
+
+// State of the CUDA consumer side of the EGL stream test.
+typedef struct _test_cuda_consumer_s {
+  // CUDA context created for the consumer by cudaDeviceCreateConsumer.
+  CUcontext context;
+  // Consumer connection used to acquire and release frames and to disconnect.
+  CUeglStreamConnection cudaConn;
+  // When true, planes are read back with cuMemcpyDtoH; otherwise with
+  // cuMemcpy3D from a CUDA array.
+  bool pitchLinearOutput;
+  // Frame dimensions, copied from TestArgs::inputWidth / inputHeight.
+  unsigned int width;
+  unsigned int height;
+  // Reference images a received frame is compared against.
+  char *fileName1;
+  char *fileName2;
+  // Names of the files received frames are dumped to.
+  char *outFile1;
+  char *outFile2;
+  // NOTE(review): not set by cuda_consumer_init in this file - confirm where it
+  // is initialised and used.
+  unsigned int frameCount;
+} test_cuda_consumer_s;
+
+void cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args);
+CUresult cuda_consumer_deinit(test_cuda_consumer_s *cudaConsumer);
+CUresult cudaConsumerTest(test_cuda_consumer_s *data, char *outFile);
+CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer,
+                                  CUdevice device);
+#endif
--- a/Samples/EGLStream_CUDA_Interop/cuda_f_1.yuv
+++ b/Samples/EGLStream_CUDA_Interop/cuda_f_1.yuv
--- a/Samples/EGLStream_CUDA_Interop/cuda_f_2.yuv
+++ b/Samples/EGLStream_CUDA_Interop/cuda_f_2.yuv
--- a/Samples/EGLStream_CUDA_Interop/cuda_producer.cpp
+++ b/Samples/EGLStream_CUDA_Interop/cuda_producer.cpp
@ -0,0 +1,381 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple cuda EGL stream producer app
+//
+
+#include "cuda_producer.h"
+#include <helper_cuda_drvapi.h>
+#include "cudaEGL.h"
+#include "eglstrm_common.h"
+
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+
+// Reads frame number `frameNum` from a planar YUV file into `pBuff`.
+//
+// A frame is treated as width*height luma bytes followed by two chroma planes
+// of (width/2) x (height/2) bytes each; planes are read row by row and stored
+// in pBuff in the same order as they appear in the file.
+//
+// Returns CUDA_SUCCESS on success, CUDA_ERROR_FILE_NOT_FOUND when `file` or
+// `pBuff` is NULL, and CUDA_ERROR_NOT_PERMITTED when seeking or reading fails.
+static CUresult cudaProducerReadYUVFrame(FILE *file, unsigned int frameNum,
+                                         unsigned int width,
+                                         unsigned int height,
+                                         unsigned char *pBuff) {
+  const int bOrderUV = 0;
+  const unsigned int frameSize = (width * height * 3) / 2;
+  const unsigned int chromaWidth = width / 2;
+  const unsigned int chromaRows = height / 2;
+
+  if (!pBuff || !file) return CUDA_ERROR_FILE_NOT_FOUND;
+
+  // YVU order in the buffer
+  unsigned char *lumaRow = pBuff;
+  unsigned char *vPlane = pBuff + width * height;
+  unsigned char *uPlane = vPlane + width * height / 4;
+
+  // Destination of the first and second chroma plane found in the file.
+  unsigned char *chromaDst[2];
+  chromaDst[0] = bOrderUV ? uPlane : vPlane;
+  chromaDst[1] = bOrderUV ? vPlane : uPlane;
+
+  if (fseek(file, frameNum * frameSize, SEEK_SET) != 0) {
+    printf("ReadYUVFrame: Error seeking file: %p\n", file);
+    return CUDA_ERROR_NOT_PERMITTED;
+  }
+
+  // Luma plane, one row per read.
+  for (unsigned int row = 0; row < height; ++row, lumaRow += width) {
+    if (fread(lumaRow, width, 1, file) != 1) {
+      printf("ReadYUVFrame: Error reading file: %p\n", file);
+      return CUDA_ERROR_NOT_PERMITTED;
+    }
+  }
+
+  // Both chroma planes, one half-width row per read.
+  for (unsigned int plane = 0; plane < 2; ++plane) {
+    unsigned char *chromaRow = chromaDst[plane];
+    for (unsigned int row = 0; row < chromaRows;
+         ++row, chromaRow += chromaWidth) {
+      if (fread(chromaRow, chromaWidth, 1, file) != 1) {
+        printf("ReadYUVFrame: Error reading file: %p\n", file);
+        return CUDA_ERROR_NOT_PERMITTED;
+      }
+    }
+  }
+
+  return CUDA_SUCCESS;
+}
+
+// Reads frame number `frameNum` from a raw ARGB file into `pBuff`.
+//
+// A frame is width * height * 4 bytes and is read with a single fread.
+//
+// Returns CUDA_SUCCESS on success, CUDA_ERROR_FILE_NOT_FOUND when `file` or
+// `pBuff` is NULL, and CUDA_ERROR_NOT_PERMITTED when seeking or reading fails
+// (including hitting end of file before a whole frame was read).
+static CUresult cudaProducerReadARGBFrame(FILE *file, unsigned int frameNum,
+                                          unsigned int width,
+                                          unsigned int height,
+                                          unsigned char *pBuff) {
+  unsigned int frameSize = width * height * 4;
+  CUresult ret = CUDA_SUCCESS;
+
+  if (!pBuff || !file) return CUDA_ERROR_FILE_NOT_FOUND;
+
+  if (fseek(file, frameNum * frameSize, SEEK_SET)) {
+    // Report under this function's own name so the log points at the ARGB
+    // path rather than the YUV reader.
+    printf("ReadARGBFrame: Error seeking file: %p\n", (void *)file);
+    ret = CUDA_ERROR_NOT_PERMITTED;
+    goto done;
+  }
+
+  // read ARGB data
+  if (fread(pBuff, frameSize, 1, file) != 1) {
+    if (feof(file))
+      printf("ReadARGBFrame: file read to the end\n");
+    else
+      printf("ReadARGBFrame: Error reading file: %p\n", (void *)file);
+    ret = CUDA_ERROR_NOT_PERMITTED;
+    goto done;
+  }
+done:
+  return ret;
+}
+
+// Reads the first frame of `file`, uploads it to the producer's CUDA buffers
+// and presents it on the EGL stream.
+//
+// The frame is read as ARGB (one plane) when cudaProducer->isARGB is set and
+// as three-plane YUV otherwise. It is uploaded with cuMemcpy into the
+// pre-allocated device pointers when cudaProducer->pitchLinearOutput is set,
+// and with cuMemcpy3D into the pre-allocated CUDA arrays otherwise.
+//
+// Parameters:
+//   cudaProducer - producer state holding the connection, buffers and
+//                  frame settings.
+//   file         - path of the input image file.
+//
+// Returns CUDA_SUCCESS when the frame was presented, CUDA_ERROR_FILE_NOT_FOUND
+// when `file` cannot be opened, the status reported by the frame reader when
+// reading fails, or the status of the failing CUDA call.
+CUresult cudaProducerTest(test_cuda_producer_s *cudaProducer, char *file) {
+  int framenum = 0;
+  CUarray cudaArr[3] = {0};
+  CUdeviceptr cudaPtr[3] = {0, 0, 0};
+  CUresult cuStatus = CUDA_SUCCESS;
+  unsigned int i, surfNum, uvOffset[3] = {0};
+  unsigned int copyWidthInBytes[3] = {0, 0, 0}, copyHeight[3] = {0, 0, 0};
+  CUeglColorFormat eglColorFormat;
+  FILE *file_p;
+  CUeglFrame cudaEgl;
+
+  file_p = fopen(file, "rb");
+  if (!file_p) {
+    printf("CudaProducer: Error opening file: %s\n", file);
+    // Report the failure instead of returning the initial CUDA_SUCCESS.
+    cuStatus = CUDA_ERROR_FILE_NOT_FOUND;
+    goto done;
+  }
+
+  // Select the destination buffers that match the layout and colour format.
+  if (cudaProducer->pitchLinearOutput) {
+    if (cudaProducer->isARGB) {
+      cudaPtr[0] = cudaProducer->cudaPtrARGB[0];
+    } else {  // YUV case
+      for (i = 0; i < 3; i++) {
+        cudaPtr[i] = cudaProducer->cudaPtrYUV[i];
+      }
+    }
+  } else {
+    if (cudaProducer->isARGB) {
+      cudaArr[0] = cudaProducer->cudaArrARGB[0];
+    } else {
+      for (i = 0; i < 3; i++) {
+        cudaArr[i] = cudaProducer->cudaArrYUV[i];
+      }
+    }
+  }
+  uvOffset[0] = 0;
+  if (cudaProducer->isARGB) {
+    cuStatus =
+        cudaProducerReadARGBFrame(file_p, framenum, cudaProducer->width,
+                                  cudaProducer->height, cudaProducer->pBuff);
+    if (cuStatus != CUDA_SUCCESS) {
+      printf("cuda producer, read ARGB frame failed\n");
+      goto done;
+    }
+    copyWidthInBytes[0] = cudaProducer->width * 4;
+    copyHeight[0] = cudaProducer->height;
+    surfNum = 1;
+    eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB;
+  } else {
+    cuStatus =
+        cudaProducerReadYUVFrame(file_p, framenum, cudaProducer->width,
+                                 cudaProducer->height, cudaProducer->pBuff);
+    if (cuStatus != CUDA_SUCCESS) {
+      printf("cuda producer, reading YUV frame failed\n");
+      goto done;
+    }
+    surfNum = 3;
+    eglColorFormat = CU_EGL_COLOR_FORMAT_YUV420_PLANAR;
+    copyWidthInBytes[0] = cudaProducer->width;
+    copyHeight[0] = cudaProducer->height;
+    copyWidthInBytes[1] = cudaProducer->width / 2;
+    copyHeight[1] = cudaProducer->height / 2;
+    copyWidthInBytes[2] = cudaProducer->width / 2;
+    copyHeight[2] = cudaProducer->height / 2;
+    // Byte offsets of the second and third plane inside pBuff.
+    uvOffset[1] = cudaProducer->width * cudaProducer->height;
+    uvOffset[2] =
+        uvOffset[1] + cudaProducer->width / 2 * cudaProducer->height / 2;
+  }
+  if (cudaProducer->pitchLinearOutput) {
+    for (i = 0; i < surfNum; i++) {
+      cuStatus =
+          cuMemcpy(cudaPtr[i], (CUdeviceptr)(cudaProducer->pBuff + uvOffset[i]),
+                   copyWidthInBytes[i] * copyHeight[i]);
+
+      if (cuStatus != CUDA_SUCCESS) {
+        printf("Cuda producer: cuMemCpy pitchlinear failed, cuStatus =%d\n",
+               cuStatus);
+        goto done;
+      }
+    }
+  } else {
+    // copy cudaProducer->pBuff to cudaArray
+    CUDA_MEMCPY3D cpdesc;
+    for (i = 0; i < surfNum; i++) {
+      memset(&cpdesc, 0, sizeof(cpdesc));
+      cpdesc.srcXInBytes = cpdesc.srcY = cpdesc.srcZ = cpdesc.srcLOD = 0;
+      cpdesc.srcMemoryType = CU_MEMORYTYPE_HOST;
+      cpdesc.srcHost = (void *)(cudaProducer->pBuff + uvOffset[i]);
+      cpdesc.dstXInBytes = cpdesc.dstY = cpdesc.dstZ = cpdesc.dstLOD = 0;
+      cpdesc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+      cpdesc.dstArray = cudaArr[i];
+      cpdesc.WidthInBytes = copyWidthInBytes[i];
+      cpdesc.Height = copyHeight[i];
+      cpdesc.Depth = 1;
+      cuStatus = cuMemcpy3D(&cpdesc);
+      if (cuStatus != CUDA_SUCCESS) {
+        printf("Cuda producer: cuMemCpy failed, cuStatus =%d\n", cuStatus);
+        goto done;
+      }
+    }
+  }
+
+  // Start from an all-zero descriptor so plane slots beyond surfNum are not
+  // handed to the driver uninitialised.
+  memset(&cudaEgl, 0, sizeof(cudaEgl));
+  for (i = 0; i < surfNum; i++) {
+    if (cudaProducer->pitchLinearOutput)
+      cudaEgl.frame.pPitch[i] = (void *)cudaPtr[i];
+    else
+      cudaEgl.frame.pArray[i] = cudaArr[i];
+  }
+  cudaEgl.width = copyWidthInBytes[0];
+  cudaEgl.depth = 1;
+  cudaEgl.height = copyHeight[0];
+  cudaEgl.pitch = cudaProducer->pitchLinearOutput ? cudaEgl.width : 0;
+  cudaEgl.frameType = cudaProducer->pitchLinearOutput ? CU_EGL_FRAME_TYPE_PITCH
+                                                      : CU_EGL_FRAME_TYPE_ARRAY;
+  cudaEgl.planeCount = surfNum;
+  cudaEgl.numChannels = (eglColorFormat == CU_EGL_COLOR_FORMAT_ARGB) ? 4 : 1;
+  cudaEgl.eglColorFormat = eglColorFormat;
+  cudaEgl.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;
+
+  cuStatus =
+      cuEGLStreamProducerPresentFrame(&cudaProducer->cudaConn, cudaEgl, NULL);
+  if (cuStatus != CUDA_SUCCESS) {
+    printf("cuda Producer present frame FAILED with custatus= %d\n", cuStatus);
+    goto done;
+  }
+
+done:
+  if (file_p) {
+    fclose(file_p);
+    file_p = NULL;
+  }
+
+  return cuStatus;
+}
+
+// Initialises the CUDA driver, prints the name and compute capability of
+// `device`, creates the producer's context and allocates every buffer the
+// producer uses:
+//   - one device pointer for ARGB and three for the YUV planes,
+//   - one CUDA array for ARGB and three for the YUV planes,
+//   - a host staging buffer (pBuff) of WIDTH * HEIGHT * 4 bytes.
+// The context is popped again before a normal return.
+//
+// Returns CUDA_SUCCESS on success, the status of the failing CUDA call, or
+// CUDA_ERROR_OUT_OF_MEMORY when the host staging buffer cannot be allocated.
+// NOTE(review): the early returns after cuCtxCreate succeeded do not pop the
+// context or release buffers allocated so far - confirm callers tear the
+// producer down on failure.
+CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer,
+                                  CUdevice device) {
+  CUresult status = CUDA_SUCCESS;
+  if (CUDA_SUCCESS != (status = cuInit(0))) {
+    printf("Failed to initialize CUDA\n");
+    return status;
+  }
+
+  int major = 0, minor = 0;
+  char deviceName[256];
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
+  checkCudaErrors(cuDeviceGetName(deviceName, 256, device));
+  printf(
+      "CUDA Producer on GPU Device %d: \"%s\" with compute capability "
+      "%d.%d\n\n",
+      device, deviceName, major, minor);
+
+  if (CUDA_SUCCESS !=
+      (status = cuCtxCreate(&cudaProducer->context, 0, device))) {
+    printf("failed to create CUDA context\n");
+    return status;
+  }
+
+  // Device pointers used when frames are presented as pitch-linear memory.
+  status = cuMemAlloc(&cudaProducer->cudaPtrARGB[0], (WIDTH * HEIGHT * 4));
+  if (status != CUDA_SUCCESS) {
+    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
+    return status;
+  }
+
+  status = cuMemAlloc(&cudaProducer->cudaPtrYUV[0], (WIDTH * HEIGHT));
+  if (status != CUDA_SUCCESS) {
+    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
+    return status;
+  }
+  status = cuMemAlloc(&cudaProducer->cudaPtrYUV[1], (WIDTH * HEIGHT) / 4);
+  if (status != CUDA_SUCCESS) {
+    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
+    return status;
+  }
+  status = cuMemAlloc(&cudaProducer->cudaPtrYUV[2], (WIDTH * HEIGHT) / 4);
+  if (status != CUDA_SUCCESS) {
+    printf("Create CUDA pointer failed, cuStatus=%d\n", status);
+    return status;
+  }
+
+  // CUDA arrays used when frames are presented as arrays.
+  CUDA_ARRAY3D_DESCRIPTOR desc = {0};
+
+  desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+  desc.Depth = 1;
+  desc.Flags = CUDA_ARRAY3D_SURFACE_LDST;
+  desc.NumChannels = 4;
+  desc.Width = WIDTH * 4;
+  desc.Height = HEIGHT;
+  status = cuArray3DCreate(&cudaProducer->cudaArrARGB[0], &desc);
+  if (status != CUDA_SUCCESS) {
+    printf("Create CUDA array failed, cuStatus=%d\n", status);
+    return status;
+  }
+
+  for (int i = 0; i < 3; i++) {
+    if (i == 0) {
+      desc.NumChannels = 1;
+      desc.Width = WIDTH;
+      desc.Height = HEIGHT;
+    } else {  // U/V surface as planar
+      desc.NumChannels = 1;
+      desc.Width = WIDTH / 2;
+      desc.Height = HEIGHT / 2;
+    }
+    status = cuArray3DCreate(&cudaProducer->cudaArrYUV[i], &desc);
+    if (status != CUDA_SUCCESS) {
+      printf("Create CUDA array failed, cuStatus=%d\n", status);
+      return status;
+    }
+  }
+
+  cudaProducer->pBuff = (unsigned char *)malloc((WIDTH * HEIGHT * 4));
+  if (!cudaProducer->pBuff) {
+    printf("CudaProducer: Failed to allocate image buffer\n");
+    // Surface the failure to the caller; the context is still popped below so
+    // the calling thread is left in the same state as on success.
+    status = CUDA_ERROR_OUT_OF_MEMORY;
+  }
+
+  checkCudaErrors(cuCtxPopCurrent(&cudaProducer->context));
+  return status;
+}
+
+// Fills the producer state from the EGL handles and the command-line settings
+// in `args`; the frame count is fixed at 2.
+void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay,
+                      EGLStreamKHR eglStream, TestArgs *args) {
+  // EGL objects the producer is attached to.
+  cudaProducer->eglDisplay = eglDisplay;
+  cudaProducer->eglStream = eglStream;
+
+  // Input image files.
+  cudaProducer->fileName1 = args->infile1;
+  cudaProducer->fileName2 = args->infile2;
+
+  // Frame geometry, colour format and memory layout.
+  cudaProducer->width = args->inputWidth;
+  cudaProducer->height = args->inputHeight;
+  cudaProducer->isARGB = args->isARGB;
+  cudaProducer->pitchLinearOutput = args->pitchLinearOutput;
+
+  cudaProducer->frameCount = 2;
+}
+
+// Releases the host staging buffer, the device pointers and the CUDA arrays
+// owned by the producer, then disconnects it from the EGL stream.
+//
+// Returns the status of cuEGLStreamProducerDisconnect; the individual free and
+// destroy calls are wrapped in checkCudaErrors.
+CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) {
+  const int kYuvPlanes = 3;
+
+  if (cudaProducer->pBuff != NULL) {
+    free(cudaProducer->pBuff);
+  }
+
+  // Device pointers: ARGB first, then the YUV planes in index order.
+  checkCudaErrors(cuMemFree(cudaProducer->cudaPtrARGB[0]));
+  for (int plane = 0; plane < kYuvPlanes; ++plane) {
+    checkCudaErrors(cuMemFree(cudaProducer->cudaPtrYUV[plane]));
+  }
+
+  // CUDA arrays, in the same order.
+  checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrARGB[0]));
+  for (int plane = 0; plane < kYuvPlanes; ++plane) {
+    checkCudaErrors(cuArrayDestroy(cudaProducer->cudaArrYUV[plane]));
+  }
+
+  return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn);
+}
--- a/Samples/EGLStream_CUDA_Interop/cuda_producer.h
+++ b/Samples/EGLStream_CUDA_Interop/cuda_producer.h
@ -0,0 +1,68 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple cuda producer header file
+//
+
+#ifndef _CUDA_PRODUCER_H_
+#define _CUDA_PRODUCER_H_
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include "cudaEGL.h"
+#include "eglstrm_common.h"
+
+extern EGLStreamKHR eglStream;
+extern EGLDisplay g_display;
+
+// State of the CUDA producer end of the EGLStream used by this sample.
+typedef struct _test_cuda_producer_s {
+  //  Stream params
+  char *fileName1;  // first input file; copied from TestArgs::infile1
+  char *fileName2;  // second input file; copied from TestArgs::infile2
+  unsigned char *pBuff;  // host buffer; released with free() in
+                         // cudaProducerDeinit
+  int frameCount;        // set to 2 by cudaProducerInit
+  bool isARGB;           // copied from TestArgs::isARGB
+  bool pitchLinearOutput;  // copied from TestArgs::pitchLinearOutput
+  unsigned int width;      // copied from TestArgs::inputWidth
+  unsigned int height;     // copied from TestArgs::inputHeight
+  CUcontext context;  // CUDA context pushed/popped around producer calls
+  CUeglStreamConnection cudaConn;  // producer connection; disconnected in
+                                   // cudaProducerDeinit
+  CUdeviceptr cudaPtrARGB[1];  // device allocation freed with cuMemFree
+  CUdeviceptr cudaPtrYUV[3];   // device allocations freed with cuMemFree
+  CUarray cudaArrARGB[1];      // CUDA array destroyed with cuArrayDestroy
+  CUarray cudaArrYUV[3];       // CUDA arrays destroyed with cuArrayDestroy
+  EGLStreamKHR eglStream;      // stream handle stored by cudaProducerInit
+  EGLDisplay eglDisplay;       // display handle stored by cudaProducerInit
+} test_cuda_producer_s;
+
+// Copies the input file names, format flags and dimensions from args into
+// cudaProducer, sets frameCount to 2 and stores the EGL display/stream.
+void cudaProducerInit(test_cuda_producer_s *cudaProducer, EGLDisplay eglDisplay,
+                      EGLStreamKHR eglStream, TestArgs *args);
+// NOTE(review): the definition is not visible from this header. main.cpp
+// passes fileName1/fileName2 as `file` and treats any result other than
+// CUDA_SUCCESS as a failed frame.
+CUresult cudaProducerTest(test_cuda_producer_s *parserArg, char *file);
+// Frees the producer's host buffer, device allocations and CUDA arrays, then
+// returns the result of cuEGLStreamProducerDisconnect.
+CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer);
+// NOTE(review): presumably creates the producer's CUDA resources on `device`;
+// main.cpp aborts the test when the result is not CUDA_SUCCESS - confirm
+// against the definition.
+CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer,
+                                  CUdevice device);
+#endif
--- a/Samples/EGLStream_CUDA_Interop/cuda_yuv_f_1.yuv
+++ b/Samples/EGLStream_CUDA_Interop/cuda_yuv_f_1.yuv
--- a/Samples/EGLStream_CUDA_Interop/cuda_yuv_f_2.yuv
+++ b/Samples/EGLStream_CUDA_Interop/cuda_yuv_f_2.yuv
--- a/Samples/EGLStream_CUDA_Interop/eglstrm_common.cpp
+++ b/Samples/EGLStream_CUDA_Interop/eglstrm_common.cpp
@ -0,0 +1,139 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Common egl stream functions
+//
+
+#include "eglstrm_common.h"
+
+// Stream created by EGLStreamInit and destroyed by EGLStreamFini.
+EGLStreamKHR eglStream;
+// Display obtained from the selected EGL device in EGLStreamInit.
+EGLDisplay g_display;
+// Receives the EGL_CUDA_DEVICE_NV attribute of each device queried in
+// EGLStreamInit.
+EGLAttrib cudaIndex;
+
+#if defined(EXTENSION_LIST)
+// Define one my_<name> function-pointer variable (initialised to NULL) for
+// every entry of EXTENSION_LIST.
+EXTENSION_LIST(EXTLST_DECL)
+// Generic function-pointer type used to store any of the entry points.
+typedef void (*extlst_fnptr_t)(void);
+// Table pairing the address of each my_<name> variable with the name string
+// handed to eglGetProcAddress in eglSetupExtensions.
+static struct {
+  extlst_fnptr_t *fnptr;
+  char const *name;
+} extensionList[] = {EXTENSION_LIST(EXTLST_ENTRY)};
+
+// Resolves every EGL extension entry point listed in extensionList through
+// eglGetProcAddress and stores it in the matching function pointer.
+//
+// Returns 1 when all entries resolved. On the first entry that cannot be
+// resolved, prints its name and returns 0; later entries are left untouched.
+int eglSetupExtensions(void) {
+  const size_t numExtensions = sizeof(extensionList) / sizeof(extensionList[0]);
+  unsigned int idx = 0;
+
+  while (idx < numExtensions) {
+    extlst_fnptr_t resolved = eglGetProcAddress(extensionList[idx].name);
+
+    *extensionList[idx].fnptr = resolved;
+    if (resolved == NULL) {
+      printf("Couldn't get address of %s()\n", extensionList[idx].name);
+      return 0;
+    }
+    ++idx;
+  }
+
+  return 1;
+}
+
+// Picks the first EGL device that reports an EGL_CUDA_DEVICE_NV attribute,
+// initialises a display on it and creates the global EGL stream.
+//
+// cuda_device: receives the CUDA device id reported for the chosen EGL device.
+//
+// Returns 1 once the stream exists and both consumer attributes were set, and
+// 0 if either eglStreamAttribKHR call fails (the stream has already been
+// created in that case). Every earlier failure terminates the process:
+// EXIT_WAIVED when no EGL device or no CUDA-capable EGL device is present,
+// EXIT_FAILURE otherwise.
+int EGLStreamInit(int *cuda_device) {
+#define MAX_EGL_DEVICES 4
+  static const EGLint streamAttrMailboxMode[] = {EGL_SUPPORT_REUSE_NV,
+                                                 EGL_FALSE, EGL_NONE};
+  EGLDeviceEXT devices[MAX_EGL_DEVICES];
+  EGLint numDevices = 0;
+  EGLBoolean eglStatus;
+
+  eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices);
+  if (eglStatus != EGL_TRUE) {
+    printf("Error querying EGL devices\n");
+    exit(EXIT_FAILURE);
+  }
+  if (numDevices == 0) {
+    printf("No EGL devices found.. Waiving\n");
+    exit(EXIT_WAIVED);
+  }
+
+  // Scan the devices in order and stop at the first one that answers the
+  // EGL_CUDA_DEVICE_NV query.
+  int selected = 0;
+  bool found = false;
+  while (!found && selected < numDevices) {
+    eglStatus = eglQueryDeviceAttribEXT(devices[selected], EGL_CUDA_DEVICE_NV,
+                                        &cudaIndex);
+    if (eglStatus == EGL_TRUE) {
+      found = true;
+    } else {
+      ++selected;
+    }
+  }
+  if (!found) {
+    printf("No CUDA Capable EGL Device found.. Waiving execution\n");
+    exit(EXIT_WAIVED);
+  }
+
+  *cuda_device = cudaIndex;  // We select first EGL-CUDA Capable device.
+  printf("Found EGL-CUDA Capable device with CUDA Device id = %d\n",
+         (int)cudaIndex);
+
+  g_display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT,
+                                       (void *)devices[selected], NULL);
+  if (g_display == EGL_NO_DISPLAY) {
+    printf("Could not get EGL display from device. \n");
+    exit(EXIT_FAILURE);
+  }
+
+  eglStatus = eglInitialize(g_display, 0, 0);
+  if (eglStatus == EGL_FALSE) {
+    printf("EGL failed to initialize. \n");
+    exit(EXIT_FAILURE);
+  }
+
+  eglStream = eglCreateStreamKHR(g_display, streamAttrMailboxMode);
+  if (eglStream == EGL_NO_STREAM_KHR) {
+    printf("Could not create EGL stream.\n");
+    exit(EXIT_FAILURE);
+  }
+  printf("Created EGLStream %p\n", eglStream);
+
+  // Set stream attributes; a failure here is reported to the caller instead
+  // of terminating the process.
+  if (!eglStreamAttribKHR(g_display, eglStream, EGL_CONSUMER_LATENCY_USEC_KHR,
+                          16000)) {
+    printf(
+        "Consumer: eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n");
+    return 0;
+  }
+  if (!eglStreamAttribKHR(g_display, eglStream,
+                          EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000)) {
+    printf(
+        "Consumer: eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR "
+        "failed\n");
+    return 0;
+  }
+
+  printf("EGLStream initialized\n");
+  return 1;
+}
+
+// Destroys the global EGL stream on the global display. The result of
+// eglDestroyStreamKHR is not inspected.
+void EGLStreamFini(void) {
+  eglDestroyStreamKHR(g_display, eglStream);
+}
+#endif
--- a/Samples/EGLStream_CUDA_Interop/eglstrm_common.h
+++ b/Samples/EGLStream_CUDA_Interop/eglstrm_common.h
@ -0,0 +1,103 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Common EGL stream functions header file
+//
+
+#ifndef _EGLSTRM_COMMON_H_
+#define _EGLSTRM_COMMON_H_
+
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "cuda.h"
+#include "cudaEGL.h"
+#include "helper_cuda_drvapi.h"
+
+// X-macro listing every EGL extension entry point the sample resolves at run
+// time. T is applied to each (function-pointer type, function name) pair; see
+// EXTLST_DECL, EXTLST_EXTERN and EXTLST_ENTRY below.
+#define EXTENSION_LIST(T)                                                \
+  T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR)                       \
+  T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR)                     \
+  T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR)                         \
+  T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR)                   \
+  T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR)                 \
+  T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR)                       \
+  T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR)     \
+  T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR)     \
+  T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC,                        \
+    eglStreamConsumerGLTextureExternalKHR)                               \
+  T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \
+  T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT)                       \
+  T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT)           \
+  T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT)             \
+  T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC,                         \
+    eglCreateStreamFromFileDescriptorKHR)
+
+// Redirect each extension function name to the my_-prefixed function pointer
+// variable, so call sites written as eglXxx(...) go through the pointer.
+#define eglCreateStreamKHR my_eglCreateStreamKHR
+#define eglDestroyStreamKHR my_eglDestroyStreamKHR
+#define eglQueryStreamKHR my_eglQueryStreamKHR
+#define eglQueryStreamu64KHR my_eglQueryStreamu64KHR
+#define eglQueryStreamTimeKHR my_eglQueryStreamTimeKHR
+#define eglStreamAttribKHR my_eglStreamAttribKHR
+#define eglStreamConsumerAcquireKHR my_eglStreamConsumerAcquireKHR
+#define eglStreamConsumerReleaseKHR my_eglStreamConsumerReleaseKHR
+#define eglStreamConsumerGLTextureExternalKHR \
+  my_eglStreamConsumerGLTextureExternalKHR
+#define eglGetStreamFileDescriptorKHR my_eglGetStreamFileDescriptorKHR
+#define eglCreateStreamFromFileDescriptorKHR \
+  my_eglCreateStreamFromFileDescriptorKHR
+#define eglQueryDevicesEXT my_eglQueryDevicesEXT
+#define eglGetPlatformDisplayEXT my_eglGetPlatformDisplayEXT
+#define eglQueryDeviceAttribEXT my_eglQueryDeviceAttribEXT
+
+// Per-entry expansions for EXTENSION_LIST:
+//   EXTLST_DECL   - defines the my_<name> pointer, initialised to NULL
+//   EXTLST_EXTERN - declares the my_<name> pointer for other source files
+//   EXTLST_ENTRY  - table row: address of the pointer plus the name string
+#define EXTLST_DECL(tx, x) tx my_##x = NULL;
+#define EXTLST_EXTERN(tx, x) extern tx my_##x;
+#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&my_##x, #x},
+
+#define MAX_STRING_SIZE 256
+// Fixed frame dimensions used throughout the sample.
+#define WIDTH 720
+#define HEIGHT 480
+
+// Per-pass settings that main() fills in and hands to the producer and
+// consumer initialisers.
+typedef struct _TestArgs {
+  char *infile1;  // path of the first input file
+  char *infile2;  // path of the second input file
+  bool isARGB;    // main() reports the pass as "ARGB" when set, "YUV" otherwise
+  unsigned int inputWidth;   // main() sets this to WIDTH
+  unsigned int inputHeight;  // main() sets this to HEIGHT
+  bool pitchLinearOutput;  // main() reports "Pitchlinear" when set,
+                           // "BlockLinear" otherwise
+} TestArgs;
+
+// Resolves the EGL extension entry points; returns 1 on success, 0 if any
+// entry point cannot be resolved.
+int eglSetupExtensions(void);
+// Creates the global EGL stream on a CUDA-capable EGL device and writes the
+// CUDA device id to *dev; returns 1 on success, 0 if a stream attribute could
+// not be set. Other failures terminate the process.
+int EGLStreamInit(int *dev);
+// Destroys the global EGL stream.
+void EGLStreamFini(void);
+#endif
--- a/Samples/EGLStream_CUDA_Interop/findegl.mk
+++ b/Samples/EGLStream_CUDA_Interop/findegl.mk
@ -0,0 +1,156 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+#  findegl.mk is used to find the necessary EGL Libraries for specific distributions
+#               this is supported on Linux
+#
+################################################################################
+
+# Determine OS platform and unix distribution.
+# DISTRO is filled in by the first of three probes that yields a value:
+# lsb_release, the first word of /etc/issue, then the ID fields of the
+# /etc/*-release files.
+ifeq ("$(TARGET_OS)","linux")
+   # first search lsb_release
+   DISTRO  = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+   ifeq ("$(DISTRO)","")
+     # second search and parse /etc/issue
+     DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]")
+     # ensure data from /etc/issue is valid: anything outside the known
+     # distribution names is discarded so the third probe can run
+     ifeq (,$(filter $(DISTRO),ubuntu fedora red rhel centos suse))
+       DISTRO = 
+     endif
+     ifeq ("$(DISTRO)","")
+       # third, we can search in /etc/os-release or /etc/{distro}-release
+       DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB")
+     endif
+   endif
+endif
+
+# Locate libEGL.so and the EGL headers. SAMPLE_ENABLED is set to 0 when the
+# library, egl.h or eglext.h cannot be found.
+ifeq ("$(TARGET_OS)","linux")
+    # $(info) >> findegl.mk -> LINUX path <<<)
+    # Each Linux distribution family keeps these libraries in a different
+    # location. Each variable below holds the grep exit status, so "0" means
+    # DISTRO matched that family.
+    UBUNTU = $(shell echo $(DISTRO) | grep -i ubuntu      >/dev/null 2>&1; echo $$?)
+    FEDORA = $(shell echo $(DISTRO) | grep -i fedora      >/dev/null 2>&1; echo $$?)
+    RHEL   = $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?)
+    CENTOS = $(shell echo $(DISTRO) | grep -i centos      >/dev/null 2>&1; echo $$?)
+    SUSE   = $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?)
+    ifeq ("$(UBUNTU)","0")
+      # Cross builds from x86_64 use the cross-toolchain library directories,
+      # plus the target filesystem when TARGET_FS is set.
+      ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        GLPATH := /usr/arm-linux-gnueabihf/lib
+        GLLINK := -L/usr/arm-linux-gnueabihf/lib
+        ifneq ($(TARGET_FS),) 
+          GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+          GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+      else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64)
+        GLPATH := /usr/aarch64-linux-gnu/lib
+        GLLINK := -L/usr/aarch64-linux-gnu/lib
+        ifneq ($(TARGET_FS),)
+          GLPATH += $(TARGET_FS)/usr/lib
+          GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+          GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+        endif 
+      else
+        # Native build: use the directory named after the first installed
+        # nvidia-* package reported by dpkg, if any.
+        UBUNTU_PKG_NAME = $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' | grep '^ii' | awk '{print $$2}' | head -1)
+        ifneq ("$(UBUNTU_PKG_NAME)","")
+          GLPATH    ?= /usr/lib/$(UBUNTU_PKG_NAME)
+          GLLINK    ?= -L/usr/lib/$(UBUNTU_PKG_NAME)
+        endif
+
+        DFLT_PATH ?= /usr/lib
+      endif
+    endif
+    # The remaining families use ?=, so values already set (by the user or by
+    # an earlier branch) are kept.
+    ifeq ("$(SUSE)","0")
+      GLPATH    ?= /usr/X11R6/lib64
+      GLLINK    ?= -L/usr/X11R6/lib64
+      DFLT_PATH ?= /usr/lib64
+    endif
+    ifeq ("$(FEDORA)","0")
+      GLPATH    ?= /usr/lib64/nvidia
+      GLLINK    ?= -L/usr/lib64/nvidia
+      DFLT_PATH ?= /usr/lib64
+    endif
+    ifeq ("$(RHEL)","0")
+      GLPATH    ?= /usr/lib64/nvidia
+      GLLINK    ?= -L/usr/lib64/nvidia
+      DFLT_PATH ?= /usr/lib64
+    endif
+    ifeq ("$(CENTOS)","0")
+      GLPATH    ?= /usr/lib64/nvidia
+      GLLINK    ?= -L/usr/lib64/nvidia
+      DFLT_PATH ?= /usr/lib64
+    endif
+
+  # Search the selected directories (following symlinks) for libEGL.so.
+  EGLLIB  := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libEGL.so    -print 2>/dev/null)
+
+  ifeq ("$(EGLLIB)","")
+      $(info >>> WARNING - libEGL.so not found, please install libEGL.so <<<)
+      SAMPLE_ENABLED := 0
+  endif
+
+  # Header search: target filesystem include directory first, plus the
+  # cross-toolchain include directory for cross builds.
+  HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include
+  ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+      HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include
+  else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux)
+      HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include
+  endif
+
+  EGLHEADER  := $(shell find -L $(HEADER_SEARCH_PATH) -name egl.h -print 2>/dev/null)
+  EGLEXTHEADER  := $(shell find -L $(HEADER_SEARCH_PATH) -name eglext.h -print 2>/dev/null)
+
+  ifeq ("$(EGLHEADER)","")
+      $(info >>> WARNING - egl.h not found, please install egl.h <<<)
+      SAMPLE_ENABLED := 0
+  endif
+  ifeq ("$(EGLEXTHEADER)","")
+      $(info >>> WARNING - eglext.h not found, please install eglext.h <<<)
+      SAMPLE_ENABLED := 0
+  endif
+else
+endif
+
+# Check whether the available EGL headers define EGL_SUPPORT_REUSE_NV.
+# A small test.c is written to the current directory and compiled only (-c);
+# it is never run. When the macro is defined, the #error directive makes the
+# compiler print "Compatible EGL header found", and grep -ic counts the
+# matching output lines. A count of 0 therefore means the macro is missing
+# and the sample is disabled.
+ifneq ($(SAMPLE_ENABLED), 0)
+      $(shell printf "#include <EGL/egl.h>\n#include <EGL/eglext.h>\nint main() {\n#ifdef EGL_SUPPORT_REUSE_NV \n #error \"Compatible EGL header found\" \n  return 0;\n#endif \n return 1;\n}"  > test.c; )
+      EGL_DEFINES := $(shell $(HOST_COMPILER) $(CCFLAGS) $(EXTRA_CCFLAGS) -lEGL test.c -c 2>&1 | grep -ic "Compatible EGL header found";)
+      SHOULD_WAIVE := 0
+      ifeq ($(EGL_DEFINES),0)
+        SHOULD_WAIVE := 1
+      endif
+      ifeq ($(SHOULD_WAIVE),1)
+          $(info -----------------------------------------------------------------------------------------------)
+          $(info WARNING - NVIDIA EGL EXTENSIONS are not available in the present EGL headers)
+          $(info -----------------------------------------------------------------------------------------------)
+          $(info   This CUDA Sample cannot be built if the EGL NVIDIA EXTENSIONS like EGL_SUPPORT_REUSE_NV are not supported in EGL headers.)
+          $(info   This will be a dry-run of the Makefile.)
+          $(info   Please install the latest khronos EGL headers and libs to build this sample)
+          $(info -----------------------------------------------------------------------------------------------)
+          SAMPLE_ENABLED := 0
+      endif
+      # Remove the probe files; errors are ignored because test.o may not exist.
+      $(shell rm test.o test.c 2>/dev/null)
+endif
+
--- a/Samples/EGLStream_CUDA_Interop/main.cpp
+++ b/Samples/EGLStream_CUDA_Interop/main.cpp
@ -0,0 +1,231 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+//
+// DESCRIPTION:   Simple EGL stream sample app
+//
+//
+
+//#define EGL_EGLEXT_PROTOTYPES
+
+#include "cudaEGL.h"
+#include "cuda_consumer.h"
+#include "cuda_producer.h"
+#include "eglstrm_common.h"
+
+/* ------  globals ---------*/
+
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+
+#define NUM_TRAILS 4
+
+bool signal_stop = 0;
+
+// SIGINT handler: raises signal_stop and reports the signal number.
+//
+// printf() is not async-signal-safe, so the "Signal: <n>\n" message is
+// assembled by hand and emitted with write(), which is.
+static void sig_handler(int sig) {
+  char msg[32] = "Signal: ";
+  char digits[12];
+  size_t len = 8;  // length of the "Signal: " prefix
+  size_t numDigits = 0;
+  // Magnitude computed in unsigned arithmetic so INT_MIN cannot overflow.
+  unsigned int magnitude =
+      (sig < 0) ? (0u - (unsigned int)sig) : (unsigned int)sig;
+
+  signal_stop = 1;
+
+  // Produce the decimal digits least-significant first, then reverse them
+  // into the message.
+  do {
+    digits[numDigits++] = (char)('0' + (magnitude % 10u));
+    magnitude /= 10u;
+  } while (magnitude != 0u);
+
+  if (sig < 0) {
+    msg[len++] = '-';
+  }
+  while (numDigits > 0) {
+    msg[len++] = digits[--numDigits];
+  }
+  msg[len++] = '\n';
+
+  // Best effort: nothing useful can be done here if the write fails.
+  ssize_t written = write(STDOUT_FILENO, msg, len);
+  (void)written;
+}
+
+// Entry point of the EGLStream/CUDA interop sample.
+//
+// Resolves the EGL extension entry points, creates the EGL stream on a
+// CUDA-capable EGL device, connects a CUDA consumer and a CUDA producer to
+// it, and then runs NUM_TRAILS passes. Each pass selects a frame format
+// (ARGB on odd passes, YUV on even ones) and an output layout, and pushes two
+// frames through the stream. Finally both ends are disconnected and the
+// stream is destroyed.
+//
+// Returns EXIT_SUCCESS when every step succeeded and EXIT_FAILURE otherwise,
+// so the exit status agrees with the PASSED/FAILED line that is printed.
+int main(int argc, char **argv) {
+  TestArgs args;
+  CUresult curesult = CUDA_SUCCESS;
+  unsigned int i, j;
+  EGLint streamState = 0;
+  // Set once eglSetupExtensions() has resolved the extension entry points.
+  // Until then eglQueryStreamKHR/eglDestroyStreamKHR may still be NULL
+  // function pointers and must not be called from the cleanup path.
+  bool eglExtensionsLoaded = false;
+
+  test_cuda_consumer_s cudaConsumer;
+  test_cuda_producer_s cudaProducer;
+
+  memset(&cudaProducer, 0, sizeof(test_cuda_producer_s));
+  memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s));
+
+  // Hook up Ctrl-C handler
+  signal(SIGINT, sig_handler);
+  if (!eglSetupExtensions()) {
+    printf("SetupExtentions failed \n");
+    curesult = CUDA_ERROR_UNKNOWN;
+    goto done;
+  }
+  eglExtensionsLoaded = true;
+
+  checkCudaErrors(cuInit(0));
+
+  // Declared without initialisers on purpose: the goto above jumps past
+  // these declarations.
+  int count;
+
+  checkCudaErrors(cuDeviceGetCount(&count));
+  printf("Found %d cuda devices\n", count);
+
+  CUdevice devId;
+
+  if (!EGLStreamInit(&devId)) {
+    printf("EGLStream Init failed.\n");
+    curesult = CUDA_ERROR_UNKNOWN;
+    goto done;
+  }
+  curesult = cudaDeviceCreateProducer(&cudaProducer, devId);
+  if (curesult != CUDA_SUCCESS) {
+    goto done;
+  }
+  curesult = cudaDeviceCreateConsumer(&cudaConsumer, devId);
+  if (curesult != CUDA_SUCCESS) {
+    goto done;
+  }
+
+  // Connect the consumer first, then the producer, each inside its own
+  // CUDA context.
+  checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
+  if (CUDA_SUCCESS != (curesult = cuEGLStreamConsumerConnect(
+                           &(cudaConsumer.cudaConn), eglStream))) {
+    printf("FAILED Connect CUDA consumer  with error %d\n", curesult);
+    goto done;
+  } else {
+    printf("Connected CUDA consumer, CudaConsumer %p\n", cudaConsumer.cudaConn);
+  }
+  checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
+
+  checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
+  if (CUDA_SUCCESS ==
+      (curesult = cuEGLStreamProducerConnect(&(cudaProducer.cudaConn),
+                                             eglStream, WIDTH, HEIGHT))) {
+    printf("Connect CUDA producer Done, CudaProducer %p\n",
+           cudaProducer.cudaConn);
+  } else {
+    printf("Connect CUDA producer FAILED with error %d\n", curesult);
+    goto done;
+  }
+  checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
+
+  // Initialize producer
+  for (i = 0; i < NUM_TRAILS; i++) {
+    if (streamState != EGL_STREAM_STATE_CONNECTING_KHR) {
+      if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
+                             &streamState)) {
+        printf("main: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+        curesult = CUDA_ERROR_UNKNOWN;
+        goto done;
+      }
+    }
+    args.inputWidth = WIDTH;
+    args.inputHeight = HEIGHT;
+    // Odd passes use the ARGB input files, even passes the YUV ones.
+    if (i % 2 != 0) {
+      args.isARGB = 1;
+      args.infile1 = sdkFindFilePath("cuda_f_1.yuv", argv[0]);
+      args.infile2 = sdkFindFilePath("cuda_f_2.yuv", argv[0]);
+    } else {
+      args.isARGB = 0;
+      args.infile1 = sdkFindFilePath("cuda_yuv_f_1.yuv", argv[0]);
+      args.infile2 = sdkFindFilePath("cuda_yuv_f_2.yuv", argv[0]);
+    }
+    // The first two passes of every four request pitch-linear output.
+    if ((i % 4) < 2) {
+      args.pitchLinearOutput = 1;
+    } else {
+      args.pitchLinearOutput = 0;
+    }
+
+    checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
+    cudaProducerInit(&cudaProducer, g_display, eglStream, &args);
+    checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
+
+    checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
+    cuda_consumer_init(&cudaConsumer, &args);
+    checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
+
+    printf("main - Cuda Producer and Consumer Initialized.\n");
+
+    // Two frames per pass: the producer runs first, then the consumer.
+    for (j = 0; j < 2; j++) {
+      printf("Running for %s frame and %s input\n",
+             args.isARGB ? "ARGB" : "YUV",
+             args.pitchLinearOutput ? "Pitchlinear" : "BlockLinear");
+      if (j == 0) {
+        checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
+        curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName1);
+        if (curesult != CUDA_SUCCESS) {
+          printf("Cuda Producer Test failed for frame = %d\n", j + 1);
+          goto done;
+        }
+        checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
+        checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
+        curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile1);
+        if (curesult != CUDA_SUCCESS) {
+          printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
+          goto done;
+        }
+        checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
+      } else {
+        checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
+        curesult = cudaProducerTest(&cudaProducer, cudaProducer.fileName2);
+        if (curesult != CUDA_SUCCESS) {
+          printf("Cuda Producer Test failed for frame = %d\n", j + 1);
+          goto done;
+        }
+
+        checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
+        checkCudaErrors(cuCtxPushCurrent(cudaConsumer.context));
+        curesult = cudaConsumerTest(&cudaConsumer, cudaConsumer.outFile2);
+        if (curesult != CUDA_SUCCESS) {
+          printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
+          goto done;
+        }
+        checkCudaErrors(cuCtxPopCurrent(&cudaConsumer.context));
+      }
+    }
+  }
+
+  checkCudaErrors(cuCtxPushCurrent(cudaProducer.context));
+  if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) {
+    printf("Producer Disconnect FAILED. \n");
+    goto done;
+  }
+  checkCudaErrors(cuCtxPopCurrent(&cudaProducer.context));
+
+  // Only disconnect the consumer if the stream is not already disconnected.
+  if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
+                         &streamState)) {
+    printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+    curesult = CUDA_ERROR_UNKNOWN;
+    goto done;
+  }
+  if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) {
+    if (CUDA_SUCCESS != (curesult = cuda_consumer_deinit(&cudaConsumer))) {
+      printf("Consumer Disconnect FAILED.\n");
+      goto done;
+    }
+  }
+  printf("Producer and Consumer Disconnected \n");
+
+done:
+  // The stream can only be queried or destroyed through the extension entry
+  // points; skip this when they were never resolved, because calling them
+  // would go through a NULL function pointer.
+  if (eglExtensionsLoaded) {
+    if (!eglQueryStreamKHR(g_display, eglStream, EGL_STREAM_STATE_KHR,
+                           &streamState)) {
+      printf("Cuda consumer, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+      curesult = CUDA_ERROR_UNKNOWN;
+    }
+    if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) {
+      EGLStreamFini();
+    }
+  }
+
+  if (curesult == CUDA_SUCCESS) {
+    printf("&&&& EGLStream interop test PASSED\n");
+  } else {
+    printf("&&&& EGLStream interop test FAILED\n");
+  }
+  // Propagate the outcome to the caller instead of always reporting success.
+  return (curesult == CUDA_SUCCESS) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
--- a/Samples/NV12toBGRandResize/Makefile
+++ b/Samples/NV12toBGRandResize/Makefile
@ -0,0 +1,322 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project is only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit; ?= keeps a CUDA_PATH already given on the command line or in the environment
+CUDA_PATH ?= /usr/local/cuda
+
+# start deprecated interface
+# Legacy switches (x86_64=1, ARMv7=1, aarch64=1, ppc64le=1, GCC=<compiler>) print a warning and are mapped onto
+# TARGET_ARCH / HOST_COMPILER with ?=, so an explicitly set replacement wins; a non-empty abi= aborts make.
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture: HOST_ARCH is `uname -m` and TARGET_ARCH defaults to it; TARGET_SIZE is fixed at 64/32 when cross compiling, otherwise taken from `getconf LONG_BIT`
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# On a native aarch64 build whose userspace is 32-bit (TARGET_SIZE from getconf is 32), switch TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system: HOST_OS is the lower-cased `uname -s`; TARGET_OS defaults to it and must be one of linux, darwin, qnx, android
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler: clang++ on darwin when the Xcode major version is >= 5, a cross toolchain picked from HOST_ARCH/TARGET_ARCH/TARGET_OS when cross compiling (QNX additionally requires QNX_HOST and QNX_TARGET), otherwise g++; every assignment uses ?= so a user-supplied HOST_COMPILER always wins
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags: NVCCFLAGS goes to nvcc as-is (here only -m32/-m64); CCFLAGS and LDFLAGS are later forwarded with -Xcompiler / -Xlinker
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags: per-platform additions first (darwin, x86_64->armv7l linux cross build, android), then sysroot / rpath-link setup for linux cross builds when TARGET_FS is set, then the QNX extras
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# Per-target subdirectory (targets/<arch>-<os>/) of the CUDA installation; stays empty for any target not listed below. NOTE(review): nothing else in this Makefile reads it - confirm whether it is still needed
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags: dbg=1 adds -g -G to nvcc and sets BUILD_TYPE to debug; any other value gives a release build (BUILD_TYPE names the ../../bin output subdirectory)
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on ARMv7: print a notice and waive it by clearing SAMPLE_ENABLED rather than raising an error
+ifeq ($(TARGET_ARCH),armv7l)
+  $(info >>> WARNING - NV12toBGRandResize is not supported on ARMv7 - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA: headers come from ../../Common; LIBRARIES starts empty and nothing in this Makefile adds to it
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments: default SM list (72 is included only for armv7l/aarch64 targets); override with SMS="..." or supply a complete GENCODE_FLAGS to skip the generation below
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
+SMS ?= 30 35 37 50 52 60 61 70 72 75
+else
+SMS ?= 30 35 37 50 52 60 61 70 75
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS), one -gencode arch=compute_XX,code=sm_XX pair per entry
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility; $(sort) orders lexically, which matches numeric order only while every entry has the same number of digits
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules: each build/run recipe is prefixed with $(EXEC); when the sample is waived EXEC defaults to @echo "[@]", so the commands are printed instead of executed. The linked binary is also copied to ../../bin/<arch>/<os>/<build type>
+all: build
+
+build: NV12toBGRandResize
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+bgr_resize.o:bgr_resize.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+nv12_resize.o:nv12_resize.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+nv12_to_bgr_planar.o:nv12_to_bgr_planar.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+resize_convert_main.o:resize_convert_main.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+utils.o:utils.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+NV12toBGRandResize: bgr_resize.o nv12_resize.o nv12_to_bgr_planar.o resize_convert_main.o utils.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./NV12toBGRandResize
+
+clean:
+	rm -f NV12toBGRandResize bgr_resize.o nv12_resize.o nv12_to_bgr_planar.o resize_convert_main.o utils.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/NV12toBGRandResize
+
+clobber: clean
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.sln
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.vcxproj
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2012.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>NV12toBGRandResize_vs2012</RootNamespace>
+    <ProjectName>NV12toBGRandResize</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="bgr_resize.cu" />
+    <CudaCompile Include="nv12_resize.cu" />
+    <CudaCompile Include="nv12_to_bgr_planar.cu" />
+    <ClCompile Include="resize_convert_main.cpp" />
+    <CudaCompile Include="utils.cu" />
+    <ClInclude Include="resize_convert.h" />
+    <ClInclude Include="utils.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.sln
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 13.00
+# Visual Studio 2013
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.vcxproj
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2013.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>NV12toBGRandResize_vs2013</RootNamespace>
+    <ProjectName>NV12toBGRandResize</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="bgr_resize.cu" />
+    <CudaCompile Include="nv12_resize.cu" />
+    <CudaCompile Include="nv12_to_bgr_planar.cu" />
+    <ClCompile Include="resize_convert_main.cpp" />
+    <CudaCompile Include="utils.cu" />
+    <ClInclude Include="resize_convert.h" />
+    <ClInclude Include="utils.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.sln
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 14.00
+# Visual Studio 2015
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.vcxproj
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2015.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>NV12toBGRandResize_vs2015</RootNamespace>
+    <ProjectName>NV12toBGRandResize</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="bgr_resize.cu" />
+    <CudaCompile Include="nv12_resize.cu" />
+    <CudaCompile Include="nv12_to_bgr_planar.cu" />
+    <ClCompile Include="resize_convert_main.cpp" />
+    <CudaCompile Include="utils.cu" />
+    <ClInclude Include="resize_convert.h" />
+    <ClInclude Include="utils.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.sln
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2017.vcxproj
@ -0,0 +1,117 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>NV12toBGRandResize_vs2017</RootNamespace>
+    <ProjectName>NV12toBGRandResize</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="bgr_resize.cu" />
+    <CudaCompile Include="nv12_resize.cu" />
+    <CudaCompile Include="nv12_to_bgr_planar.cu" />
+    <ClCompile Include="resize_convert_main.cpp" />
+    <CudaCompile Include="utils.cu" />
+    <ClInclude Include="resize_convert.h" />
+    <ClInclude Include="utils.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.sln
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NV12toBGRandResize", "NV12toBGRandResize_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj
+++ b/Samples/NV12toBGRandResize/NV12toBGRandResize_vs2019.vcxproj
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>NV12toBGRandResize_vs2019</RootNamespace>
+    <ProjectName>NV12toBGRandResize</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/NV12toBGRandResize.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="bgr_resize.cu" />
+    <CudaCompile Include="nv12_resize.cu" />
+    <CudaCompile Include="nv12_to_bgr_planar.cu" />
+    <ClCompile Include="resize_convert_main.cpp" />
+    <CudaCompile Include="utils.cu" />
+    <ClInclude Include="resize_convert.h" />
+    <ClInclude Include="utils.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/NV12toBGRandResize/NsightEclipse.xml
+++ b/Samples/NV12toBGRandResize/NsightEclipse.xml
@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>NV12toBGRandResize</name>
+  <cuda_api_list>
+    <toolkit>cudaMemcpy2D</toolkit>
+    <toolkit>cudaMallocManaged</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This code shows two ways to convert and resize NV12 frames to BGR 3 planars frames using CUDA in batch. Way-1, Convert NV12 Input to BGR @ Input Resolution-1, then Resize to Resolution#2. Way-2, resize NV12 Input to Resolution#2 then convert it to BGR Output. NVIDIA HW Decoder, both dGPU and Tegra, normally outputs NV12 pitch format frames. For the inference using TensorRT, the input frame needs to be BGR planar format with possibly different size. So, conversion and resizing from NV12 to BGR planar is usually required for the inference following decoding. This CUDA code provides a reference implementation for conversion and resizing.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../common/inc</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">Graphics Interop</concept>
+    <concept level="basic">Image Processing</concept>
+    <concept level="basic">Video Processing</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>GPGPU</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>resize_convert_main.cpp</primary_file>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>2:Image Processing</scope>
+    <scope>2:Computer Vision</scope>
+  </scopes>
+  <sm-arch>sm30</sm-arch>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>aarch64</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>NV12toBGRandResize</title>
+  <type>exe</type>
+</entry>
--- a/Samples/NV12toBGRandResize/README.md
+++ b/Samples/NV12toBGRandResize/README.md
@ -0,0 +1,94 @@
+# NV12toBGRandResize - NV12toBGRandResize
+
+## Description
+
+This sample shows two ways to convert and resize batches of NV12 frames to 3-plane BGR (planar) frames using CUDA. Way 1: convert the NV12 input to BGR at the input resolution (Resolution#1), then resize to Resolution#2. Way 2: resize the NV12 input to Resolution#2, then convert it to BGR output. The NVIDIA hardware decoder, on both dGPU and Tegra, normally outputs frames in NV12 pitch format. For inference using TensorRT, the input frame needs to be in BGR planar format, possibly at a different size. Conversion and resizing from NV12 to BGR planar is therefore usually required between decoding and inference. This CUDA code provides a reference implementation for that conversion and resizing.
+
+## Key Concepts
+
+Graphics Interop, Image Processing, Video Processing
+
+## Supported SM Architectures
+
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows, MacOSX
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaMemcpy2D, cudaMallocManaged
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+### Mac
+The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+
+The samples makefiles can take advantage of certain options:
+
+*  **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+
+*  **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=clang
+    ```
+
+## References (for more details)
+
--- a/Samples/NV12toBGRandResize/bgr_resize.cu
+++ b/Samples/NV12toBGRandResize/bgr_resize.cu
@ -0,0 +1,134 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+// Implements batch resize of progressive BGR frames stored as 3 separate planes
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "resize_convert.h"
+
+// Resizes a batch of planar 3-channel float frames by sampling a texture.
+//
+// Each thread owns one destination pixel (x from the x-dimension of the
+// launch, y from the y-dimension) and writes that pixel for all three planes
+// of every frame it visits. Frames are visited starting at blockIdx.z and
+// stepping by gridDim.z, so any z-extent of the grid covers the whole batch.
+//
+//   texSrc      - texture holding the source planes stacked vertically; plane
+//                 `channel` of frame `i` starts at row (3*i + channel)*nSrcHeight
+//   pDst        - destination buffer, 3 planes of nDstPitch*nDstHeight floats
+//                 per frame (nDstPitch is used as an element stride)
+//   batch       - number of frames to process
+//   scaleX/Y    - source step per destination pixel
+//   cropX/Y/W/H - source crop rectangle; threads outside
+//                 (cropW/scaleX, cropH/scaleY) exit without writing
+//
+// How the texture is filtered is decided by the texture object itself.
+__global__ void resizeBGRplanarBatchKernel(cudaTextureObject_t texSrc,
+    float *pDst, int nDstPitch, int nDstHeight, int nSrcHeight,
+    int batch, float scaleX, float scaleY,
+    int cropX, int cropY, int cropW, int cropH) {
+    const int col = threadIdx.x + blockIdx.x * blockDim.x;
+    const int row = threadIdx.y + blockIdx.y * blockDim.y;
+
+    // Extent of the region this launch actually produces.
+    const int outCols = (int)(cropW/scaleX);
+    const int outRows = (int)(cropH/scaleY);
+    const bool inside = (col < outCols) && (row < outRows);
+    if (!inside)
+        return;
+
+    const int planeSize = nDstPitch*nDstHeight;
+    for (int frame = blockIdx.z; frame < batch; frame += gridDim.z) {
+        // First plane of this frame in the destination buffer.
+        float *frameBase = pDst + frame * 3 * planeSize;
+        #pragma unroll
+        for (int c = 0; c < 3; c++) {
+            float *out = frameBase + row * nDstPitch + col + c * planeSize;
+            // Source row = plane offset inside the stacked texture plus the
+            // scaled and cropped row inside that plane.
+            *out = tex2D<float>(texSrc, col * scaleX + cropX,
+                                ((3 * frame + c) * nSrcHeight + row * scaleY + cropY));
+        }
+    }
+}
+
+
+// Host-side launcher for resizeBGRplanarBatchKernel.
+//
+// Binds the source batch (nBatchSize frames x 3 planes x nSrcHeight rows of
+// floats) to pitch-2D texture objects with linear filtering and launches the
+// resize kernel on `stream`, once per texture.
+//
+//   dpSrc, dpDst       - source / destination buffers; nSrcPitch and nDstPitch
+//                        are strides in float elements (the byte pitch handed
+//                        to the texture is nSrcPitch * sizeof(float))
+//   nSrc*/nDst*        - frame dimensions in pixels
+//   whSameResizeRatio  - when true, the larger of the two scale factors is
+//                        used for both axes
+//   cropX/Y/W/H        - source crop rectangle; cropW == 0 or cropH == 0
+//                        selects the full source frame
+//
+// The batch is split into at most two tiles so that no texture is taller than
+// 65536 rows (presumably the device's pitch-2D texture height limit -- TODO
+// confirm). If more than two tiles would be required the function returns
+// without doing any work and without reporting an error.
+static void resizeBGRplanarBatchCore(
+        float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
+        float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
+        int nBatchSize, cudaStream_t stream, bool whSameResizeRatio,
+        int cropX, int cropY, int cropW, int cropH) {
+    cudaTextureObject_t texSrc[2];
+    int nTiles = 1, h, iTile;
+
+    // Total number of texture rows if the whole batch were one texture.
+    h = nSrcHeight * 3 * nBatchSize;
+    while ((h + nTiles - 1) / nTiles > 65536)
+        nTiles++;
+
+    // texSrc only has room for two tiles.
+    if (nTiles > 2)
+        return;
+
+    // The crop rectangle, scale factors and block shape are identical for
+    // every tile, so resolve them once before the loop.
+    if(cropW == 0 || cropH == 0) {
+        cropX = 0;
+        cropY = 0;
+        cropW = nSrcWidth;
+        cropH = nSrcHeight;
+    }
+
+    float scaleX = (cropW*1.0f / nDstWidth);
+    float scaleY = (cropH*1.0f / nDstHeight);
+
+    if(whSameResizeRatio == true)
+        scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
+    dim3 block(32, 32, 1);
+
+    int batchTile = nBatchSize / nTiles;
+    // The last tile also takes the remainder of the integer division.
+    int batchTileLast = nBatchSize - batchTile * (nTiles-1);
+
+    for (iTile = 0; iTile < nTiles; ++iTile) {
+        int bs = (iTile == nTiles - 1) ? batchTileLast : batchTile;
+        float *dpSrcNew = dpSrc +
+            iTile * (batchTile * 3 * nSrcHeight * nSrcPitch);
+
+        cudaResourceDesc resDesc = {};
+        resDesc.resType = cudaResourceTypePitch2D;
+        resDesc.res.pitch2D.devPtr = dpSrcNew;
+        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
+        resDesc.res.pitch2D.width = nSrcWidth;
+        resDesc.res.pitch2D.height = bs * 3 * nSrcHeight;
+        resDesc.res.pitch2D.pitchInBytes = nSrcPitch * sizeof(float);
+        cudaTextureDesc texDesc = {};
+        texDesc.filterMode = cudaFilterModeLinear;
+        texDesc.readMode = cudaReadModeElementType;
+
+        checkCudaErrors(cudaCreateTextureObject(&texSrc[iTile], &resDesc, &texDesc, NULL));
+        float *dpDstNew = dpDst +
+            iTile * (batchTile * 3 * nDstHeight * nDstPitch);
+
+        // Cap the grid's z-extent at 32 so that large batches do not launch
+        // too many blocks; the kernel loops over the remaining frames.
+        size_t gridZ = bs;
+        gridZ = (gridZ > 32) ? 32 : gridZ;
+        dim3 grid((cropW*1.0f/scaleX + block.x - 1) / block.x,
+                  (cropH*1.0f/scaleY + block.y - 1) / block.y, gridZ);
+
+        resizeBGRplanarBatchKernel<<<grid, block, 0, stream>>>
+                (texSrc[iTile], dpDstNew, nDstPitch, nDstHeight, nSrcHeight,
+                bs, scaleX, scaleY, cropX, cropY, cropW, cropH);
+        // A launch does not return a status itself; an invalid configuration
+        // is only visible through the runtime's last-error state.
+        checkCudaErrors(cudaGetLastError());
+
+    }
+
+    // NOTE(review): the kernels were queued asynchronously on `stream` and may
+    // still be running here; confirm that destroying the texture objects
+    // without synchronizing first is safe for the intended use.
+    for (iTile = 0; iTile < nTiles; ++iTile)
+        checkCudaErrors(cudaDestroyTextureObject(texSrc[iTile]));
+}
+
+// Public entry point for the batched planar BGR resize.
+//
+// Forwards every argument unchanged to resizeBGRplanarBatchCore. The only
+// difference between the two signatures is the position of
+// whSameResizeRatio, which is the last parameter here and precedes the crop
+// rectangle in the core routine.
+void resizeBGRplanarBatch(
+        float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
+        float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
+        int nBatchSize, cudaStream_t stream,
+        int cropX, int cropY, int cropW, int cropH, bool whSameResizeRatio) {
+    resizeBGRplanarBatchCore(
+        dpSrc, nSrcPitch, nSrcWidth, nSrcHeight,
+        dpDst, nDstPitch, nDstWidth, nDstHeight,
+        nBatchSize, stream, whSameResizeRatio,
+        cropX, cropY, cropW, cropH);
+}
--- a/Samples/NV12toBGRandResize/data/test1280x720.nv12
+++ b/Samples/NV12toBGRandResize/data/test1280x720.nv12
--- a/Samples/NV12toBGRandResize/data/test1920x1080.nv12
+++ b/Samples/NV12toBGRandResize/data/test1920x1080.nv12
--- a/Samples/NV12toBGRandResize/data/test640x480.nv12
+++ b/Samples/NV12toBGRandResize/data/test640x480.nv12
--- a/Samples/NV12toBGRandResize/nv12_resize.cu
+++ b/Samples/NV12toBGRandResize/nv12_resize.cu
@ -0,0 +1,112 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Implements batch resize of NV12 frames (luma plane followed by interleaved chroma)
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "resize_convert.h"
+
+// Resizes a batch of NV12 frames by sampling luma and chroma textures.
+//
+// Each thread produces one 2x2 block of luma samples and the single chroma
+// pair that belongs to it, for every frame it visits. Frames are visited
+// starting at blockIdx.z and stepping by gridDim.z.
+//
+//   texSrcLuma   - source batch read as single bytes
+//   texSrcChroma - the same source batch read as byte pairs
+//   pDstNv12     - destination batch; each frame occupies
+//                  nDstPitch * (nDstHeight * 3 / 2) bytes
+//   nSrc*/nDst*  - frame dimensions in pixels; nDstPitch is in bytes
+//   nBatchSize   - number of frames
+//
+// Source rows are addressed in "stacked" coordinates: destination row
+// (hh * i + r) of the whole batch maps to source row (hh * i + r) * fyScale,
+// where hh is the number of rows per destination frame.
+//
+// The destination pointer and the luma source rows are derived from the frame
+// index on every iteration. Computing them once before the loop made every
+// frame read the luma of frame 0 and, when gridDim.z > 1, made all z-blocks
+// write to the wrong frames.
+__global__ static void resizeNV12BatchKernel(cudaTextureObject_t texSrcLuma,
+                                             cudaTextureObject_t texSrcChroma,
+                                             uint8_t *pDstNv12, int nSrcWidth,
+                                             int nSrcHeight, int nDstPitch,
+                                             int nDstWidth, int nDstHeight,
+                                             int nBatchSize) {
+  int x = threadIdx.x + blockIdx.x * blockDim.x;
+  int y = threadIdx.y + blockIdx.y * blockDim.y;
+
+  // Top-left corner of this thread's 2x2 luma block.
+  int px = x * 2, py = y * 2;
+
+  if ((px + 1) >= nDstWidth || (py + 1) >= nDstHeight) return;
+
+  float fxScale = 1.0f * nSrcWidth / nDstWidth;
+  float fyScale = 1.0f * nSrcHeight / nDstHeight;
+
+  // Rows and bytes per destination frame (luma rows + half as many chroma
+  // rows).
+  int hh = nDstHeight * 3 / 2;
+  size_t nByte = (size_t)nDstPitch * hh;
+
+  // Source columns do not depend on the frame.
+  int px_fxScale = px * fxScale;
+  int px_fxScale_1 = (px + 1) * fxScale;
+
+  for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) {
+    // Start of this thread's 2x2 block inside destination frame i.
+    uint8_t *p = pDstNv12 + i * nByte + (size_t)py * nDstPitch + px;
+
+    // Luma rows of frame i in stacked coordinates, scaled to the source.
+    int batchRow = hh * i + py;
+    int py_fyScale = batchRow * fyScale;
+    int py_fyScale_1 = (batchRow + 1) * fyScale;
+
+    *(uchar2 *)p = make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale),
+                          tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale));
+    *(uchar2 *)(p + nDstPitch) =
+        make_uchar2(tex2D<uint8_t>(texSrcLuma, px_fxScale, py_fyScale_1),
+               tex2D<uint8_t>(texSrcLuma, px_fxScale_1, py_fyScale_1));
+    // p already points at row py = 2*y, so adding (nDstHeight - y) rows lands
+    // on chroma row y, which is stored at row nDstHeight + y of the frame.
+    *(uchar2 *)(p + (nDstHeight - y) * nDstPitch) = tex2D<uchar2>(
+        texSrcChroma, x * fxScale, (hh * i + nDstHeight + y) * fyScale);
+  }
+}
+
+// Host-side launcher for the batched NV12 resize.
+//
+// Wraps the source buffer in two pitch-2D texture objects that view the same
+// memory with point filtering: one reads single bytes (luma), the other reads
+// byte pairs at half the width (chroma). It then launches
+// resizeNV12BatchKernel on `stream`.
+//
+//   dpSrc, nSrcPitch  - source batch and its row pitch in bytes
+//   nSrcWidth/Height  - source frame size in pixels
+//   dpDst, nDstPitch  - destination batch and its row pitch
+//   nDstWidth/Height  - destination frame size in pixels
+//   nBatchSize        - number of frames in the batch
+//   stream            - stream the kernel is queued on
+void resizeNV12Batch(uint8_t *dpSrc, int nSrcPitch, int nSrcWidth,
+                     int nSrcHeight, uint8_t *dpDst, int nDstPitch,
+                     int nDstWidth, int nDstHeight, int nBatchSize,
+                     cudaStream_t stream) {
+  // Rows per source frame: nSrcHeight * 1.5, rounded up.
+  int hhSrc = ceilf(nSrcHeight * 3.0f / 2.0f);
+  cudaResourceDesc resDesc = {};
+  resDesc.resType = cudaResourceTypePitch2D;
+  resDesc.res.pitch2D.devPtr = dpSrc;
+  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uint8_t>();
+  resDesc.res.pitch2D.width = nSrcWidth;
+  resDesc.res.pitch2D.height = hhSrc * nBatchSize;
+  resDesc.res.pitch2D.pitchInBytes = nSrcPitch;
+
+  cudaTextureDesc texDesc = {};
+  texDesc.filterMode = cudaFilterModePoint;
+  texDesc.readMode = cudaReadModeElementType;
+
+  cudaTextureObject_t texLuma = 0;
+  checkCudaErrors(cudaCreateTextureObject(&texLuma, &resDesc, &texDesc, NULL));
+
+  // Same memory viewed as two-byte elements, hence half as many per row.
+  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar2>();
+  resDesc.res.pitch2D.width /= 2;
+
+  cudaTextureObject_t texChroma = 0;
+  checkCudaErrors(cudaCreateTextureObject(&texChroma, &resDesc, &texDesc, NULL));
+
+  dim3 block(32, 32, 1);
+
+  // Cap the grid's z-extent at 32 so that large batches do not launch too
+  // many blocks; the kernel loops over the remaining frames.
+  size_t gridZ = nBatchSize;
+  gridZ = (gridZ > 32) ? 32 : gridZ;
+
+  // One thread per 2x2 luma block. The quotient always yields at least one
+  // block per axis; surplus threads are rejected by the kernel's bounds check.
+  dim3 grid((nDstWidth / 2 + block.x) / block.x,
+            (nDstHeight / 2 + block.y) / block.y, gridZ);
+  resizeNV12BatchKernel<<<grid, block, 0, stream>>>(
+      texLuma, texChroma, dpDst, nSrcWidth, nSrcHeight, nDstPitch, nDstWidth,
+      nDstHeight, nBatchSize);
+  // A launch does not return a status itself; an invalid configuration is
+  // only visible through the runtime's last-error state.
+  checkCudaErrors(cudaGetLastError());
+
+  // NOTE(review): the kernel was queued asynchronously on `stream` and may
+  // still be running here; confirm that destroying the texture objects
+  // without synchronizing first is safe for the intended use.
+  checkCudaErrors(cudaDestroyTextureObject(texLuma));
+  checkCudaErrors(cudaDestroyTextureObject(texChroma));
+}
--- a/Samples/NV12toBGRandResize/nv12_to_bgr_planar.cu
+++ b/Samples/NV12toBGRandResize/nv12_to_bgr_planar.cu
@ -0,0 +1,154 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+// Implements NV12 to BGR batch conversion
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "resize_convert.h"
+
+#define CONV_THREADS_X 64
+#define CONV_THREADS_Y 10
+
+// Limits x to the closed range [lower, upper]. The lower bound is tested
+// first; a value that satisfies neither comparison is returned unchanged.
+__forceinline__ __device__ static float clampF(float x, float lower,
+                                               float upper) {
+  if (x < lower) {
+    return lower;
+  }
+  if (x > upper) {
+    return upper;
+  }
+  return x;
+}
+/* Converts a batch of NV12 frames into planar float BGR.
+ *
+ * Work split, as coded below:
+ *  - Each thread converts a tile of 4 pixels by 2 rows: four luma bytes from
+ *    row 2y, four from row 2y + 1, and four chroma bytes (two pairs) read
+ *    from row nHeight + y of the same frame.
+ *  - blockIdx.z picks the first frame handled by a block; further frames are
+ *    visited in steps of gridDim.z until nBatchSize is reached.
+ *
+ * Layout, as coded below:
+ *  - Source frames lie (nHeight x nNv12Pitch x 3) >> 1 bytes apart.
+ *  - Destination frames lie (nHeight x nRgbPitch x 3) >> 2 floats apart, so
+ *    nRgbPitch is used as a pitch in bytes.
+ *  - Every destination frame holds three planes of nHeight rows, stored in
+ *    the order B, G, R, each value clamped to [0, 255].
+ *
+ * NOTE(review): the 32-bit loads and float4 stores presumably need
+ * nNv12Pitch to be a multiple of 4, nRgbPitch a multiple of 16 and suitably
+ * aligned base pointers - confirm against the callers.
+ * NOTE(review): luma is multiplied by 1.1644 without an offset being
+ * subtracted first - confirm this matches the intended colour matrix.
+ */
+__global__ static void nv12ToBGRplanarBatchKernel(const uint8_t *pNv12,
+                                                  int nNv12Pitch, float *pBgr,
+                                                  int nRgbPitch, int nWidth,
+                                                  int nHeight, int nBatchSize) {
+  int x = threadIdx.x + blockIdx.x * blockDim.x;  // tile column (4 pixels wide)
+  int y = threadIdx.y + blockIdx.y * blockDim.y;  // tile row (2 pixels high)
+
+  if ((x << 2) + 1 > nWidth || (y << 1) + 1 > nHeight) return;  // tile starts outside the image
+
+  const uint8_t *__restrict__ pSrc = pNv12;
+
+  for (int i = blockIdx.z; i < nBatchSize; i += gridDim.z) {
+    pSrc = pNv12 + i * ((nHeight * nNv12Pitch * 3) >> 1) + (x << 2) +
+           (y << 1) * nNv12Pitch;  // first luma byte of this thread's tile in frame i
+    uchar4 luma2x01, luma2x23, uv2;
+    *(uint32_t *)&luma2x01 = *(uint32_t *)pSrc;  // luma, row 2y
+    *(uint32_t *)&luma2x23 = *(uint32_t *)(pSrc + nNv12Pitch);  // luma, row 2y + 1
+    *(uint32_t *)&uv2 = *(uint32_t *)(pSrc + (nHeight - y) * nNv12Pitch);  // chroma, row nHeight + y
+
+    float *pDstBlock = (pBgr + i * ((nHeight * nRgbPitch * 3) >> 2) +
+                        ((blockIdx.x * blockDim.x) << 2) +
+                        ((blockIdx.y * blockDim.y) << 1) * (nRgbPitch >> 2));  // origin of this block's area in frame i
+
+    float2 add1;  // blue contribution of the two chroma pairs
+    float2 add2;  // green contribution
+    float2 add3;  // red contribution
+    float2 add00, add01, add02, add03;  // scaled luma; .x uses chroma pair 0, .y pair 1
+    float2 d, e;  // chroma samples with 128 subtracted
+
+    add00.x = 1.1644f * luma2x01.x;
+    add01.x = 1.1644f * luma2x01.y;
+    add00.y = 1.1644f * luma2x01.z;
+    add01.y = 1.1644f * luma2x01.w;
+
+    add02.x = 1.1644f * luma2x23.x;
+    add03.x = 1.1644f * luma2x23.y;
+    add02.y = 1.1644f * luma2x23.z;
+    add03.y = 1.1644f * luma2x23.w;
+
+    d.x = uv2.x - 128.0f;  // bytes 0 and 2 feed d, bytes 1 and 3 feed e
+    e.x = uv2.y - 128.0f;
+    d.y = uv2.z - 128.0f;
+    e.y = uv2.w - 128.0f;
+
+    add1.x = 2.0172f * d.x;
+    add1.y = 2.0172f * d.y;
+
+    add2.x = (-0.3918f) * d.x + (-0.8130f) * e.x;
+    add2.y = (-0.3918f) * d.y + (-0.8130f) * e.y;
+
+    add3.x = 1.5960f * e.x;
+    add3.y = 1.5960f * e.y;
+
+    int rowStride = (threadIdx.y << 1) * (nRgbPitch >> 2);  // float offset of row 2y inside the block area
+    int nextRowStride = ((threadIdx.y << 1) + 1) * (nRgbPitch >> 2);  // same for row 2y + 1
+    // B plane: first plane of the frame, upper row then lower row of the tile
+    *((float4 *)&pDstBlock[rowStride + (threadIdx.x << 2)]) =
+        make_float4(clampF(add00.x + add1.x, 0.0f, 255.0f),
+                    clampF(add01.x + add1.x, 0.0f, 255.0f),
+                    clampF(add00.y + add1.y, 0.0f, 255.0f),
+                    clampF(add01.y + add1.y, 0.0f, 255.0f));
+    *((float4 *)&pDstBlock[nextRowStride + (threadIdx.x << 2)]) =
+        make_float4(clampF(add02.x + add1.x, 0.0f, 255.0f),
+                    clampF(add03.x + add1.x, 0.0f, 255.0f),
+                    clampF(add02.y + add1.y, 0.0f, 255.0f),
+                    clampF(add03.y + add1.y, 0.0f, 255.0f));
+
+    int planeStride = nHeight * nRgbPitch >> 2;  // floats per colour plane
+    // G plane: one planeStride after the B plane
+    *((float4 *)&pDstBlock[planeStride + rowStride + (threadIdx.x << 2)]) =
+        make_float4(clampF(add00.x + add2.x, 0.0f, 255.0f),
+                    clampF(add01.x + add2.x, 0.0f, 255.0f),
+                    clampF(add00.y + add2.y, 0.0f, 255.0f),
+                    clampF(add01.y + add2.y, 0.0f, 255.0f));
+    *((float4 *)&pDstBlock[planeStride + nextRowStride + (threadIdx.x << 2)]) =
+        make_float4(clampF(add02.x + add2.x, 0.0f, 255.0f),
+                    clampF(add03.x + add2.x, 0.0f, 255.0f),
+                    clampF(add02.y + add2.y, 0.0f, 255.0f),
+                    clampF(add03.y + add2.y, 0.0f, 255.0f));
+
+    // R plane: two planeStrides after the B plane
+    *((float4
+           *)&pDstBlock[(planeStride << 1) + rowStride + (threadIdx.x << 2)]) =
+        make_float4(clampF(add00.x + add3.x, 0.0f, 255.0f),
+                    clampF(add01.x + add3.x, 0.0f, 255.0f),
+                    clampF(add00.y + add3.y, 0.0f, 255.0f),
+                    clampF(add01.y + add3.y, 0.0f, 255.0f));
+    *((float4 *)&pDstBlock[(planeStride << 1) + nextRowStride +
+                           (threadIdx.x << 2)]) =
+        make_float4(clampF(add02.x + add3.x, 0.0f, 255.0f),
+                    clampF(add03.x + add3.x, 0.0f, 255.0f),
+                    clampF(add02.y + add3.y, 0.0f, 255.0f),
+                    clampF(add03.y + add3.y, 0.0f, 255.0f));
+  }
+}
+
+// Host launcher for nv12ToBGRplanarBatchKernel.
+//
+//   pNv12 / nNv12Pitch  device NV12 source and its row pitch in bytes
+//   pBgr  / nRgbPitch   device float destination and its row pitch in bytes
+//                       (the kernel shifts it right by 2 to index floats)
+//   nWidth / nHeight    frame size in pixels
+//   nBatchSize          number of frames; nothing is launched when <= 0
+//   stream              stream the kernel is launched on
+//
+// One thread covers 4 x 2 pixels, so the grid spans nWidth / 4 by nHeight / 2
+// threads in blocks of CONV_THREADS_X x CONV_THREADS_Y.
+void nv12ToBGRplanarBatch(uint8_t *pNv12, int nNv12Pitch, float *pBgr,
+                          int nRgbPitch, int nWidth, int nHeight,
+                          int nBatchSize, cudaStream_t stream) {
+  // Without frames the kernel has no work, and a grid with zero blocks in Z
+  // would not be a valid launch configuration.
+  if (nBatchSize <= 0) return;
+
+  dim3 threads(CONV_THREADS_X, CONV_THREADS_Y);
+
+  // Restricting blocks in Z-dim till 32 to not launch too many blocks
+  size_t gridZ = (nBatchSize > 32) ? 32 : nBatchSize;
+
+  dim3 blocks((nWidth / 4 - 1) / threads.x + 1,
+              (nHeight / 2 - 1) / threads.y + 1, gridZ);
+  nv12ToBGRplanarBatchKernel<<<blocks, threads, 0, stream>>>(
+      pNv12, nNv12Pitch, pBgr, nRgbPitch, nWidth, nHeight, nBatchSize);
+  // A kernel launch returns nothing; an invalid configuration is only
+  // reported through cudaGetLastError().
+  checkCudaErrors(cudaGetLastError());
+}
--- a/Samples/NV12toBGRandResize/resize_convert.h
+++ b/Samples/NV12toBGRandResize/resize_convert.h
@ -0,0 +1,56 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __H_RESIZE_CONVERT__
+#define __H_RESIZE_CONVERT__
+
+#include <iostream>
+#include <helper_cuda.h>
+
+/* NV12 resize.
+ * Resizes nBatchSize NV12 frames from dpSrc into dpDst; the kernel is launched
+ * on `stream`. The source is sampled through textures created with the row
+ * pitch nSrcPitch (bytes); nDstPitch is handed to the kernel as the
+ * destination pitch.
+ */
+extern "C"
+void resizeNV12Batch(
+    uint8_t *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
+    uint8_t *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
+    int nBatchSize, cudaStream_t stream = 0);
+
+/* BGR resize.
+ * NOTE(review): presumably resizes planar float BGR frames, with an optional
+ * crop rectangle (cropX, cropY, cropW, cropH) and a flag that keeps the same
+ * ratio for width and height - confirm against the implementation, including
+ * the unit of the two pitch arguments.
+ */
+extern "C"
+void resizeBGRplanarBatch(
+    float *dpSrc, int nSrcPitch, int nSrcWidth, int nSrcHeight,
+    float *dpDst, int nDstPitch, int nDstWidth, int nDstHeight,
+    int nBatchSize, cudaStream_t stream = 0,
+    int cropX = 0, int cropY = 0, int cropW = 0, int cropH = 0,
+    bool whSameResizeRatio = false);
+
+/* NV12 to BGR planar.
+ * Converts nBatchSize NV12 frames into planar float BGR on `stream`.
+ * nNv12Pitch is the source row pitch in bytes; nRgbPitch is the destination
+ * row pitch in bytes (the kernel divides it by 4 to index floats).
+ */
+extern "C"
+void nv12ToBGRplanarBatch(uint8_t *pNv12, int nNv12Pitch,
+    float *pRgb, int nRgbPitch, int nWidth, int nHeight,
+    int nBatchSize, cudaStream_t stream=0);
+#endif
--- a/Samples/NV12toBGRandResize/resize_convert_main.cpp
+++ b/Samples/NV12toBGRandResize/resize_convert_main.cpp
@ -0,0 +1,448 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/*
+NVIDIA HW Decoder, both dGPU and Tegra, normally outputs NV12 pitch format
+frames. For the inference using TensorRT, the input frame needs to be BGR planar
+format with possibly different size. So, conversion and resizing from NV12 to
+BGR planar is usually required for the inference following decoding.
+This CUDA code is to provide a reference implementation for conversion and
+resizing.
+
+Limitation
+==========
+    NV12 resize needs the image height to be an even value.
+
+Note
+====
+    The resize functions need the pitch of the image buffer to be aligned to 32.
+
+Run
+====
+./NV12toBGRandResize
+   OR
+./NV12toBGRandResize -input=data/test1920x1080.nv12 -width=1920 -height=1080 \
+-dst_width=640 -dst_height=480 -batch=40 -device=0
+
+*/
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+#include "resize_convert.h"
+#include "utils.h"
+
+#define TEST_LOOP 20
+/* Settings for one run of the sample. The single instance g_ctx is zeroed and
+ * filled by parseCmdLine(); ctx_pitch and ctx_heights are derived in main(). */
+typedef struct _nv12_to_bgr24_context_t {
+  int width;   // input NV12 image width (-width)
+  int height;  // input NV12 image height (-height)
+  int pitch;   // input image pitch (-pitch); set to width when left at 0
+
+  int dst_width;   // output image width (-dst_width)
+  int dst_height;  // output image height (-dst_height)
+  int dst_pitch;   // output image pitch (-dst_pitch); set to dst_width when left at 0
+
+  int batch;   // number of frames processed per call (-batch)
+  int device;  // cuda device ID, as returned by findCudaDevice()
+
+  char *input_nv12_file;  // path of the NV12 input file (-input)
+
+  int ctx_pitch;    // width rounded up to a multiple of 32; row pitch of the device input buffer
+  int ctx_heights;  // rows per NV12 frame: ceil(height * 3 / 2)
+
+} nv12_to_bgr24_context;
+
+nv12_to_bgr24_context g_ctx;  // global run configuration shared by all functions in this file
+
+// Writes the usage banner followed by one line per supported option to
+// standard output. app_name is echoed after "Usage:".
+static void printHelp(const char *app_name) {
+  static const char *const kOptionLines[] = {
+      "\t-h,--help\n\n",
+      "\t-input=nv12file             nv12 input file\n",
+      "\t-width=width                input nv12 image width, <1 -- 4096>\n",
+      "\t-height=height              input nv12 image height, <1 -- 4096>\n",
+      "\t-pitch=pitch(optional)      input nv12 image pitch, <0 -- 4096>\n",
+      "\t-dst_width=width            output BGR image width, <1 -- 4096>\n",
+      "\t-dst_height=height          output BGR image height, <1 -- 4096>\n",
+      "\t-dst_pitch=pitch(optional)  output BGR image pitch, <0 -- 4096>\n",
+      "\t-batch=batch                process frames count, <1 -- 4096>\n\n",
+      "\t-device=device_num(optional)   cuda device number, <0 -- 4096>\n\n",
+  };
+  const size_t lineCount = sizeof(kOptionLines) / sizeof(kOptionLines[0]);
+
+  std::cout << "Usage:" << app_name << " [options]\n\n";
+  std::cout << "OPTIONS:\n";
+  for (size_t idx = 0; idx < lineCount; ++idx) {
+    std::cout << kOptionLines[idx];
+  }
+}
+
+/*
+  Fills g_ctx from the command line.
+
+  Without any argument the bundled 1920x1080 test file is used together with
+  default sizes and a batch of 24. Otherwise every -name=value option that is
+  present overwrites the matching field; missing pitches default to the
+  respective width.
+
+  Returns 0 on success and -1 when help was requested, the default input file
+  cannot be found, or a mandatory setting is missing or not positive (the
+  usage text is printed in the help and validation cases).
+ */
+int parseCmdLine(int argc, char *argv[]) {
+  const char **args = (const char **)argv;
+
+  memset(&g_ctx, 0, sizeof(g_ctx));
+
+  // argv[0] is the program name, so help flags are searched from argv[1] on.
+  for (int i = 1; i < argc; i++) {
+    if (argv[i] && (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))) {
+      printHelp(argv[0]);
+      return -1;
+    }
+  }
+
+  if (argc == 1) {
+    // Run using default arguments
+
+    g_ctx.input_nv12_file = sdkFindFilePath("test1920x1080.nv12", argv[0]);
+    if (g_ctx.input_nv12_file == NULL) {
+      printf("Cannot find input file test1920x1080.nv12\n Exiting\n");
+      // Same negative failure code as every other error path of this function.
+      return -1;
+    }
+    g_ctx.width = 1920;
+    g_ctx.height = 1080;
+    g_ctx.dst_width = 640;
+    g_ctx.dst_height = 480;
+    g_ctx.batch = 24;
+  } else if (argc > 1) {
+    if (checkCmdLineFlag(argc, args, "width")) {
+      g_ctx.width = getCmdLineArgumentInt(argc, args, "width");
+    }
+
+    if (checkCmdLineFlag(argc, args, "height")) {
+      g_ctx.height = getCmdLineArgumentInt(argc, args, "height");
+    }
+
+    if (checkCmdLineFlag(argc, args, "pitch")) {
+      g_ctx.pitch = getCmdLineArgumentInt(argc, args, "pitch");
+    }
+
+    if (checkCmdLineFlag(argc, args, "input")) {
+      getCmdLineArgumentString(argc, args, "input",
+                               (char **)&g_ctx.input_nv12_file);
+    }
+
+    if (checkCmdLineFlag(argc, args, "dst_width")) {
+      g_ctx.dst_width = getCmdLineArgumentInt(argc, args, "dst_width");
+    }
+
+    if (checkCmdLineFlag(argc, args, "dst_height")) {
+      g_ctx.dst_height = getCmdLineArgumentInt(argc, args, "dst_height");
+    }
+
+    if (checkCmdLineFlag(argc, args, "dst_pitch")) {
+      g_ctx.dst_pitch = getCmdLineArgumentInt(argc, args, "dst_pitch");
+    }
+
+    if (checkCmdLineFlag(argc, args, "batch")) {
+      g_ctx.batch = getCmdLineArgumentInt(argc, args, "batch");
+    }
+  }
+
+  g_ctx.device = findCudaDevice(argc, args);
+
+  // Sizes and the batch count are used for allocations and launch grids, so
+  // zero and negative values are rejected here.
+  if ((g_ctx.width <= 0) || (g_ctx.height <= 0) || (g_ctx.dst_width <= 0) ||
+      (g_ctx.dst_height <= 0) || (g_ctx.batch <= 0) ||
+      !g_ctx.input_nv12_file) {
+    printHelp(argv[0]);
+    return -1;
+  }
+
+  if (g_ctx.pitch == 0) g_ctx.pitch = g_ctx.width;
+  if (g_ctx.dst_pitch == 0) g_ctx.dst_pitch = g_ctx.dst_width;
+
+  return 0;
+}
+
+/*
+  Reads one NV12 frame from g_ctx.input_nv12_file and replicates it
+  g_ctx.batch times into d_inputNV12.
+
+  The file is read with a row pitch of g_ctx.pitch bytes; each copy is stored
+  on the device with a row pitch of g_ctx.ctx_pitch bytes and
+  g_ctx.ctx_heights rows.
+
+  Returns 0 on success and -1 when the file cannot be opened, holds less than
+  one frame, or the staging buffer cannot be allocated.
+ */
+static int loadNV12Frame(unsigned char *d_inputNV12) {
+  unsigned char *pNV12FrameData;
+  unsigned char *d_nv12;
+  int frameSize;
+  std::ifstream nv12File(g_ctx.input_nv12_file, std::ifstream::in | std::ios::binary);
+
+  if (!nv12File.is_open()) {
+    std::cerr << "Can't open files\n";
+    return -1;
+  }
+
+  frameSize = g_ctx.pitch * g_ctx.ctx_heights;
+
+#if USE_UVM_MEM
+  pNV12FrameData = d_inputNV12;
+#else
+  pNV12FrameData = (unsigned char *)malloc(frameSize);
+  if (pNV12FrameData == NULL) {
+    std::cerr << "Failed to malloc pNV12FrameData\n";
+    return -1;
+  }
+#endif
+
+  nv12File.read((char *)pNV12FrameData, frameSize);
+
+  if (nv12File.gcount() < frameSize) {
+    std::cerr << "can't get one frame!\n";
+#if (USE_UVM_MEM == 0)
+    // The staging buffer belongs to this function; release it on failure too.
+    free(pNV12FrameData);
+#endif
+    return -1;
+  }
+
+#if USE_UVM_MEM
+  // Prefetch to GPU for following GPU operation
+  checkCudaErrors(
+      cudaStreamAttachMemAsync(NULL, pNV12FrameData, 0, cudaMemAttachGlobal));
+#endif
+
+  // expand one frame to multi frames for batch processing
+  d_nv12 = d_inputNV12;
+  for (int i = 0; i < g_ctx.batch; i++) {
+    // The frame was read with rows g_ctx.pitch bytes apart, so that is the
+    // source pitch; only the g_ctx.width payload bytes of a row are copied.
+    checkCudaErrors(cudaMemcpy2D((void *)d_nv12, g_ctx.ctx_pitch,
+                                 pNV12FrameData, g_ctx.pitch, g_ctx.width,
+                                 g_ctx.ctx_heights, cudaMemcpyHostToDevice));
+
+    d_nv12 += (size_t)g_ctx.ctx_pitch * g_ctx.ctx_heights;
+  }
+
+#if (USE_UVM_MEM == 0)
+  free(pNV12FrameData);
+#endif
+  nv12File.close();
+
+  return 0;
+}
+
+/*
+  1. resize interlace nv12 to target size
+  2. convert nv12 to bgr 3 progressive planars
+
+  Each step runs TEST_LOOP times on the default stream and is timed with CUDA
+  events; the averaged times are printed. The converted frames are written
+  through dumpBGR() with folder "t1".
+
+  d_inputNV12 holds g_ctx.batch frames with a row pitch of g_ctx.ctx_pitch.
+ */
+void nv12ResizeAndNV12ToBGR(unsigned char *d_inputNV12) {
+  unsigned char *d_resizedNV12;
+  float *d_outputBGR;
+  size_t size;
+  char filename[40];
+
+  // The intermediate NV12 frames are tightly packed: their pitch equals the
+  // destination width, both when they are written and when they are read.
+  const int resizedPitch = g_ctx.dst_width;
+  const size_t resizedRows = (size_t)ceil(g_ctx.dst_height * 3.0f / 2.0f);
+
+  /* allocate device memory for resized nv12 output */
+  size = (size_t)resizedPitch * resizedRows * g_ctx.batch *
+         sizeof(unsigned char);
+  checkCudaErrors(cudaMalloc((void **)&d_resizedNV12, size));
+
+  /* allocate device memory for bgr output */
+  size = (size_t)g_ctx.dst_pitch * g_ctx.dst_height * 3 * g_ctx.batch *
+         sizeof(float);
+  checkCudaErrors(cudaMalloc((void **)&d_outputBGR, size));
+
+  /* create cuda event handles */
+  cudaEvent_t start, stop;
+  checkCudaErrors(cudaEventCreate(&start));
+  checkCudaErrors(cudaEventCreate(&stop));
+  float elapsedTime = 0.0f;
+
+  /* resize interlace nv12 */
+
+  checkCudaErrors(cudaEventRecord(start, 0));
+  for (int i = 0; i < TEST_LOOP; i++) {
+    resizeNV12Batch(d_inputNV12, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height,
+                    d_resizedNV12, resizedPitch, g_ctx.dst_width,
+                    g_ctx.dst_height, g_ctx.batch);
+  }
+  checkCudaErrors(cudaEventRecord(stop, 0));
+  checkCudaErrors(cudaEventSynchronize(stop));
+
+  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));
+  printf(
+      "  CUDA resize nv12(%dx%d --> %dx%d), batch: %d,"
+      " average time: %.3f ms ==> %.3f ms/frame\n",
+      g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch,
+      (elapsedTime / (TEST_LOOP * 1.0f)),
+      (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
+
+  /* convert nv12 to bgr 3 progressive planars */
+  checkCudaErrors(cudaEventRecord(start, 0));
+  for (int i = 0; i < TEST_LOOP; i++) {
+    nv12ToBGRplanarBatch(d_resizedNV12, resizedPitch,  // input
+                         d_outputBGR,
+                         (int)(g_ctx.dst_pitch * sizeof(float)),  // output
+                         g_ctx.dst_width, g_ctx.dst_height,       // output
+                         g_ctx.batch, 0);
+  }
+  checkCudaErrors(cudaEventRecord(stop, 0));
+  checkCudaErrors(cudaEventSynchronize(stop));
+
+  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));
+
+  printf(
+      "  CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d,"
+      " average time: %.3f ms ==> %.3f ms/frame\n",
+      g_ctx.dst_width, g_ctx.dst_height, g_ctx.dst_width, g_ctx.dst_height,
+      g_ctx.batch, (elapsedTime / (TEST_LOOP * 1.0f)),
+      (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
+
+  sprintf(filename, "converted_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height);
+  dumpBGR(d_outputBGR, g_ctx.dst_pitch, g_ctx.dst_width, g_ctx.dst_height,
+          g_ctx.batch, (char *)"t1", filename);
+
+  /* release resources */
+  checkCudaErrors(cudaEventDestroy(start));
+  checkCudaErrors(cudaEventDestroy(stop));
+  checkCudaErrors(cudaFree(d_resizedNV12));
+  checkCudaErrors(cudaFree(d_outputBGR));
+}
+
+/*
+  1. convert nv12 to bgr 3 progressive planars
+  2. resize bgr 3 planars to target size
+
+  Each step runs TEST_LOOP times on the default stream and is timed with CUDA
+  events; the averaged times are printed. The resized frames are written
+  through dumpBGR() with folder "t2".
+
+  d_inputNV12 holds g_ctx.batch frames with a row pitch of g_ctx.ctx_pitch.
+*/
+void nv12ToBGRandBGRresize(unsigned char *d_inputNV12) {
+  float *d_bgr;
+  float *d_resizedBGR;
+  size_t size;
+  char filename[40];
+
+  /* allocate device memory for bgr output */
+  size = (size_t)g_ctx.ctx_pitch * g_ctx.height * 3 * g_ctx.batch *
+         sizeof(float);
+  checkCudaErrors(cudaMalloc((void **)&d_bgr, size));
+
+  /* allocate device memory for resized bgr output */
+  size = (size_t)g_ctx.dst_width * g_ctx.dst_height * 3 * g_ctx.batch *
+         sizeof(float);
+  checkCudaErrors(cudaMalloc((void **)&d_resizedBGR, size));
+
+  /* create cuda event handles */
+  cudaEvent_t start, stop;
+  checkCudaErrors(cudaEventCreate(&start));
+  checkCudaErrors(cudaEventCreate(&stop));
+  float elapsedTime = 0.0f;
+
+  /* convert interlace nv12 to bgr 3 progressive planars */
+  // Drain earlier work before the start event so that it is not timed.
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaEventRecord(start, 0));
+  for (int i = 0; i < TEST_LOOP; i++) {
+    nv12ToBGRplanarBatch(d_inputNV12, g_ctx.ctx_pitch, d_bgr,
+                         (int)(g_ctx.ctx_pitch * sizeof(float)), g_ctx.width,
+                         g_ctx.height, g_ctx.batch, 0);
+  }
+  checkCudaErrors(cudaEventRecord(stop, 0));
+  checkCudaErrors(cudaEventSynchronize(stop));
+
+  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));
+  printf(
+      "  CUDA convert nv12(%dx%d) to bgr(%dx%d), batch: %d,"
+      " average time: %.3f ms ==> %.3f ms/frame\n",
+      g_ctx.width, g_ctx.height, g_ctx.width, g_ctx.height, g_ctx.batch,
+      (elapsedTime / (TEST_LOOP * 1.0f)),
+      (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
+
+  /* resize bgr 3 progressive planars */
+  checkCudaErrors(cudaEventRecord(start, 0));
+  for (int i = 0; i < TEST_LOOP; i++) {
+    resizeBGRplanarBatch(d_bgr, g_ctx.ctx_pitch, g_ctx.width, g_ctx.height,
+                         d_resizedBGR, g_ctx.dst_width, g_ctx.dst_width,
+                         g_ctx.dst_height, g_ctx.batch);
+  }
+  checkCudaErrors(cudaEventRecord(stop, 0));
+  checkCudaErrors(cudaEventSynchronize(stop));
+
+  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));
+  printf(
+      "  CUDA resize bgr(%dx%d --> %dx%d), batch: %d,"
+      " average time: %.3f ms ==> %.3f ms/frame\n",
+      g_ctx.width, g_ctx.height, g_ctx.dst_width, g_ctx.dst_height, g_ctx.batch,
+      (elapsedTime / (TEST_LOOP * 1.0f)),
+      (elapsedTime / (TEST_LOOP * 1.0f)) / g_ctx.batch);
+
+  sprintf(filename, "resized_bgr_%dx%d", g_ctx.dst_width, g_ctx.dst_height);
+  // d_resizedBGR was allocated and filled with a pitch of dst_width, so that
+  // is the pitch the dump has to step with.
+  dumpBGR(d_resizedBGR, g_ctx.dst_width, g_ctx.dst_width, g_ctx.dst_height,
+          g_ctx.batch, (char *)"t2", filename);
+
+  /* release resources */
+  checkCudaErrors(cudaEventDestroy(start));
+  checkCudaErrors(cudaEventDestroy(stop));
+  checkCudaErrors(cudaFree(d_bgr));
+  checkCudaErrors(cudaFree(d_resizedBGR));
+}
+
+/*
+  Entry point: parses the options, uploads g_ctx.batch copies of the input
+  frame and runs both test sequences (resize then convert, convert then
+  resize). Returns EXIT_SUCCESS, or EXIT_FAILURE when parsing or loading fails.
+ */
+int main(int argc, char *argv[]) {
+  unsigned char *d_inputNV12;
+
+  // Any non-zero result means the options could not be used.
+  if (parseCmdLine(argc, argv) != 0) return EXIT_FAILURE;
+
+  // Round the width up to the next multiple of 32 for the device row pitch.
+  const int ctx_alignment = 32;
+  g_ctx.ctx_pitch =
+      ((g_ctx.width + ctx_alignment - 1) / ctx_alignment) * ctx_alignment;
+
+  g_ctx.ctx_heights = ceil(g_ctx.height * 3.0f / 2.0f);
+
+  // Computed in size_t so that large frames or batches do not overflow int.
+  const size_t inputBytes =
+      (size_t)g_ctx.ctx_pitch * g_ctx.ctx_heights * g_ctx.batch;
+
+  /* load nv12 yuv data into d_inputNV12 with batch of copies */
+#if USE_UVM_MEM
+  checkCudaErrors(cudaMallocManaged((void **)&d_inputNV12, inputBytes,
+                                    cudaMemAttachHost));
+  printf("\nUSE_UVM_MEM\n");
+#else
+  checkCudaErrors(cudaMalloc((void **)&d_inputNV12, inputBytes));
+#endif
+  if (loadNV12Frame(d_inputNV12)) {
+    std::cerr << "failed to load batch data!\n";
+    checkCudaErrors(cudaFree(d_inputNV12));
+    return EXIT_FAILURE;
+  }
+
+  /* firstly resize nv12, then convert nv12 to bgr */
+  printf("\nTEST#1:\n");
+  nv12ResizeAndNV12ToBGR(d_inputNV12);
+
+  /* first convert nv12 to bgr, then resize bgr */
+  printf("\nTEST#2:\n");
+  nv12ToBGRandBGRresize(d_inputNV12);
+
+  checkCudaErrors(cudaFree(d_inputNV12));
+
+  return EXIT_SUCCESS;
+}
--- a/Samples/NV12toBGRandResize/utils.cu
+++ b/Samples/NV12toBGRandResize/utils.cu
@ -0,0 +1,152 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fstream>
+#include <iostream>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "resize_convert.h"
+#include "utils.h"
+
+// Kernel: repacks planar float frames into interleaved bytes.
+// One thread per pixel position (1D launch); the thread walks all batchSize
+// frames. For every frame the three source planes, each height * width
+// floats, are read at the thread's pixel and stored as three consecutive
+// bytes, each float being converted with an (unsigned char) cast.
+__global__ void floatToChar(float *src, unsigned char *dst, int height,
+                            int width, int batchSize) {
+  const int pixelCount = height * width;
+  const int pixel = threadIdx.x + blockIdx.x * blockDim.x;
+
+  // Threads past the last pixel of the image have nothing to do.
+  if (pixel >= pixelCount) return;
+
+  const int frameElems = pixelCount * 3;
+
+  for (int frame = 0; frame < batchSize; ++frame) {
+    const float *planar = src + frame * frameElems + pixel;
+    unsigned char *packed = dst + frame * frameElems + pixel * 3;
+
+    // channel 0 = b, 1 = g, 2 = r
+    for (int channel = 0; channel < 3; ++channel) {
+      packed[channel] = (unsigned char)planar[channel * pixelCount];
+    }
+  }
+}
+
+// Host launcher for floatToChar: one thread per pixel in blocks of 1024
+// threads on the default stream. src and dst are passed straight to the
+// kernel and therefore have to be device-accessible.
+void floatPlanarToChar(float *src, unsigned char *dst, int height, int width,
+                       int batchSize) {
+  const int threadsPerBlock = 1024;
+  const int blockCount = (height * width - 1) / threadsPerBlock + 1;
+
+  floatToChar<<<blockCount, threadsPerBlock, 0, NULL>>>(src, dst, height,
+                                                        width, batchSize);
+  // A kernel launch returns nothing; an invalid configuration is only
+  // reported through cudaGetLastError().
+  checkCudaErrors(cudaGetLastError());
+}
+
+// Writes every frame of a planar float BGR batch to its own raw file
+// output/<folder>/<tag>_<n>.raw, n counting from 1.
+//
+//   d_srcBGR   device buffer holding batchSize frames of 3 planes each
+//   pitch      row pitch of the device buffer in floats
+//   width      payload floats per row that are written to the file
+//   height     rows per plane
+//   folder/tag pieces of the output path; at most 100 characters of each
+//              are used
+//
+// The output directory is created on a best-effort basis through the shell.
+void dumpRawBGR(float *d_srcBGR, int pitch, int width, int height,
+                int batchSize, char *folder, char *tag) {
+  float *bgr, *d_bgr;
+  size_t frameSize;
+  char directory[120];
+  char mkdir_cmd[256];
+  // The precision limits keep every formatted string inside its buffer.
+#if !defined(_WIN32)
+  sprintf(directory, "output/%.100s", folder);
+  sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory);
+#else
+  sprintf(directory, "output\\%.100s", folder);
+  sprintf(mkdir_cmd, "mkdir %s 2> nul", directory);
+#endif
+
+  // The result is ignored on purpose: the directory may already exist.
+  int ret = system(mkdir_cmd);
+  (void)ret;
+
+  frameSize = (size_t)width * height * 3 * sizeof(float);
+  bgr = (float *)malloc(frameSize);
+  if (bgr == NULL) {
+    std::cerr << "Failed malloc for bgr\n";
+    return;
+  }
+
+  d_bgr = d_srcBGR;
+  for (int i = 0; i < batchSize; i++) {
+    char filename[256];
+
+    // Row-wise copy: device rows are `pitch` floats apart, the host copy is
+    // tightly packed. With pitch == width this equals one linear copy.
+    checkCudaErrors(cudaMemcpy2D((void *)bgr, (size_t)width * sizeof(float),
+                                 (void *)d_bgr, (size_t)pitch * sizeof(float),
+                                 (size_t)width * sizeof(float),
+                                 (size_t)height * 3, cudaMemcpyDeviceToHost));
+    sprintf(filename, "%s/%.100s_%d.raw", directory, tag, (i + 1));
+
+    // Binary mode keeps the float bytes untouched on every platform.
+    std::ofstream outputFile(filename, std::ios::out | std::ios::binary);
+    if (outputFile.is_open()) {
+      outputFile.write((char *)bgr, frameSize);
+    } else {
+      std::cerr << "Failed to open " << filename << "\n";
+    }
+
+    d_bgr += (size_t)pitch * height * 3;
+  }
+
+  free(bgr);
+}
+/* Exported entry point for dumping planar float BGR frames: forwards all
+ * arguments unchanged to dumpRawBGR(). */
+void dumpBGR(float *d_srcBGR, int pitch, int width, int height, int batchSize,
+             char *folder, char *tag) {
+  dumpRawBGR(d_srcBGR, pitch, width, height, batchSize, folder, tag);
+}
+
+// Copies `size` bytes from the device buffer d_nv12 to the host and writes
+// them to output/<folder>/<tag>.nv12. At most 50 characters of folder and of
+// tag are used. The output directory is created on a best-effort basis
+// through the shell.
+void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag) {
+  unsigned char *nv12Data;
+  char filename[120];
+  char directory[60];
+  char mkdir_cmd[256];
+  // The precision limits keep every formatted string inside its buffer.
+#if !defined(_WIN32)
+  sprintf(directory, "output/%.50s", folder);
+  sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory);
+#else
+  sprintf(directory, "output\\%.50s", folder);
+  sprintf(mkdir_cmd, "mkdir %s 2> nul", directory);
+#endif
+
+  // The result is ignored on purpose: the directory may already exist.
+  int ret = system(mkdir_cmd);
+  (void)ret;
+
+  sprintf(filename, "%s/%.50s.nv12", directory, tag);
+
+  nv12Data = (unsigned char *)malloc(size * (sizeof(char)));
+  if (nv12Data == NULL) {
+    std::cerr << "Failed to allcoate memory\n";
+    return;
+  }
+
+  checkCudaErrors(cudaMemcpy((void *)nv12Data, (void *)d_nv12, size,
+                             cudaMemcpyDeviceToHost));
+
+  // Binary mode keeps the pixel bytes untouched on every platform; the stream
+  // closes itself when it goes out of scope.
+  std::ofstream nv12File(filename, std::ios::out | std::ios::binary);
+  if (nv12File.is_open()) {
+    nv12File.write((const char *)nv12Data, size);
+  } else {
+    std::cerr << "Failed to new " << filename;
+  }
+
+  free(nv12Data);
+}
--- a/Samples/NV12toBGRandResize/utils.h
+++ b/Samples/NV12toBGRandResize/utils.h
@ -0,0 +1,37 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __H_UTIL_
+#define __H_UTIL_
+/* Debug dump helpers; both create output/<folder> through the shell first.
+ *  dumpBGR - writes each of the batchSize planar float BGR frames in the
+ *            device buffer d_srcBGR to output/<folder>/<tag>_<n>.raw
+ *  dumpYUV - copies `size` bytes from the device buffer d_nv12 and writes
+ *            them to output/<folder>/<tag>.nv12
+ */
+extern "C"
+void dumpBGR(float *d_srcBGR, int pitch, int width, int height,
+              int batchSize, char *folder, char *tag);
+extern "C"
+void dumpYUV(unsigned char *d_nv12, int size, char *folder, char *tag);
+#endif
--- a/Samples/UnifiedMemoryPerf/Makefile
+++ b/Samples/UnifiedMemoryPerf/Makefile
@ -1,5 +1,5 @@
 ################################################################################
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj
@ -19,12 +19,16 @@
    <ProjectName>UnifiedMemoryPerf</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''"> <!-- only evaluated when no Windows SDK version has been supplied yet -->
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion> <!-- asks MSBuild's ToolLocationHelper for the latest 'Windows' 10.0 SDK target platform version -->
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion> <!-- inner condition repeats the group condition; uses the version found above -->
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion> <!-- keeps TargetPlatformVersion equal to the selected SDK version -->
+  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
    <PlatformToolset>v141</PlatformToolset>
-	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.sln
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2019.vcxproj
@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryPerf_vs2019</RootNamespace>
+    <ProjectName>UnifiedMemoryPerf</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup> <!-- unconditional: these settings apply to both Debug and Release -->
+    <ConfigurationType>Application</ConfigurationType> <!-- the project builds an executable -->
+    <CharacterSet>MultiByte</CharacterSet> <!-- multi-byte character set rather than Unicode -->
+    <PlatformToolset>v142</PlatformToolset> <!-- Visual Studio 2019 C++ toolset -->
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> <!-- no exact SDK build is pinned. NOTE(review): VS2019 is expected to resolve 10.0 to the newest installed Windows 10 SDK; confirm -->
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup> <!-- unconditional defaults shared by every configuration -->
+    <ClCompile> <!-- host C++ compiler defaults -->
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- appends to, rather than replaces, inherited defines -->
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories> <!-- project directory, CUDA toolkit headers, shared sample helpers in ../../Common -->
+    </ClCompile>
+    <Link> <!-- linker defaults -->
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> <!-- static CUDA runtime followed by Win32 import libraries -->
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryPerf.exe</OutputFile>
+    </Link>
+    <CudaCompile> <!-- defaults for items built through the CUDA build customization -->
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration> <!-- one compute_XX,sm_XX pair per targeted GPU architecture, 3.0 through 7.5 -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions> <!-- forwards /wd 4819 to the host compiler. NOTE(review): C4819 is presumably the code page warning; confirm -->
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="commonKernels.cu" />
+    <ClCompile Include="helperFunctions.cpp" />
+    <CudaCompile Include="matrixMultiplyPerf.cu" />
+    <ClInclude Include="commonDefs.hpp" />
+    <ClInclude Include="commonKernels.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/UnifiedMemoryPerf/commonDefs.hpp
+++ b/Samples/UnifiedMemoryPerf/commonDefs.hpp
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/UnifiedMemoryPerf/commonKernels.cu
+++ b/Samples/UnifiedMemoryPerf/commonKernels.cu
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/UnifiedMemoryPerf/commonKernels.hpp
+++ b/Samples/UnifiedMemoryPerf/commonKernels.hpp
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/UnifiedMemoryPerf/helperFunctions.cpp
+++ b/Samples/UnifiedMemoryPerf/helperFunctions.cpp
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu
+++ b/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/bandwidthTest/Makefile
+++ b/Samples/bandwidthTest/Makefile
@ -1,5 +1,5 @@
 ################################################################################
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
--- a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj
+++ b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj
@ -19,12 +19,16 @@
    <ProjectName>bandwidthTest</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''"> <!-- only evaluated when no Windows SDK version has been supplied yet -->
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion> <!-- asks MSBuild's ToolLocationHelper for the latest 'Windows' 10.0 SDK target platform version -->
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion> <!-- inner condition repeats the group condition; uses the version found above -->
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion> <!-- keeps TargetPlatformVersion equal to the selected SDK version -->
+  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
    <PlatformToolset>v141</PlatformToolset>
-	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
--- a/Samples/bandwidthTest/bandwidthTest_vs2019.sln
+++ b/Samples/bandwidthTest/bandwidthTest_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj
+++ b/Samples/bandwidthTest/bandwidthTest_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>bandwidthTest_vs2019</RootNamespace>
+    <ProjectName>bandwidthTest</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup> <!-- unconditional: these settings apply to both Debug and Release -->
+    <ConfigurationType>Application</ConfigurationType> <!-- the project builds an executable -->
+    <CharacterSet>MultiByte</CharacterSet> <!-- multi-byte character set rather than Unicode -->
+    <PlatformToolset>v142</PlatformToolset> <!-- Visual Studio 2019 C++ toolset -->
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> <!-- no exact SDK build is pinned. NOTE(review): VS2019 is expected to resolve 10.0 to the newest installed Windows 10 SDK; confirm -->
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup> <!-- unconditional defaults shared by every configuration -->
+    <ClCompile> <!-- host C++ compiler defaults -->
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- appends to, rather than replaces, inherited defines -->
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories> <!-- project directory, CUDA toolkit headers, shared sample helpers in ../../Common -->
+    </ClCompile>
+    <Link> <!-- linker defaults -->
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> <!-- static CUDA runtime followed by Win32 import libraries -->
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/bandwidthTest.exe</OutputFile>
+    </Link>
+    <CudaCompile> <!-- defaults for items built through the CUDA build customization -->
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration> <!-- one compute_XX,sm_XX pair per targeted GPU architecture, 3.0 through 7.5 -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions> <!-- forwards /wd 4819 to the host compiler. NOTE(review): C4819 is presumably the code page warning; confirm -->
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="bandwidthTest.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientCudaGraphs/Makefile
+++ b/Samples/conjugateGradientCudaGraphs/Makefile
@ -1,5 +1,5 @@
 ################################################################################
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj
@ -19,12 +19,16 @@
    <ProjectName>conjugateGradientCudaGraphs</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''"> <!-- only evaluated when no Windows SDK version has been supplied yet -->
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion> <!-- asks MSBuild's ToolLocationHelper for the latest 'Windows' 10.0 SDK target platform version -->
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion> <!-- inner condition repeats the group condition; uses the version found above -->
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion> <!-- keeps TargetPlatformVersion equal to the selected SDK version -->
+  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
    <PlatformToolset>v141</PlatformToolset>
-	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.sln
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientCudaGraphs_vs2019</RootNamespace>
+    <ProjectName>conjugateGradientCudaGraphs</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup> <!-- unconditional: these settings apply to both Debug and Release -->
+    <ConfigurationType>Application</ConfigurationType> <!-- the project builds an executable -->
+    <CharacterSet>MultiByte</CharacterSet> <!-- multi-byte character set rather than Unicode -->
+    <PlatformToolset>v142</PlatformToolset> <!-- Visual Studio 2019 C++ toolset -->
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> <!-- no exact SDK build is pinned. NOTE(review): VS2019 is expected to resolve 10.0 to the newest installed Windows 10 SDK; confirm -->
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup> <!-- unconditional defaults shared by every configuration -->
+    <ClCompile> <!-- host C++ compiler defaults -->
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- appends to, rather than replaces, inherited defines -->
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories> <!-- NOTE(review): $(CudaToolkitDir)/include and $(CudaToolkitIncludeDir) presumably name the same directory; confirm whether both are needed -->
+    </ClCompile>
+    <Link> <!-- linker defaults -->
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> <!-- cuBLAS and cuSPARSE, then the static CUDA runtime and Win32 import libraries -->
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientCudaGraphs.exe</OutputFile>
+    </Link>
+    <CudaCompile> <!-- defaults for items built through the CUDA build customization -->
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration> <!-- one compute_XX,sm_XX pair per targeted GPU architecture, 3.0 through 7.5 -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions> <!-- forwards /wd 4819 to the host compiler. NOTE(review): C4819 is presumably the code page warning; confirm -->
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientCudaGraphs.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientMultiBlockCG/Makefile
+++ b/Samples/conjugateGradientMultiBlockCG/Makefile
@ -1,5 +1,5 @@
 ################################################################################
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj
@ -19,12 +19,16 @@
    <ProjectName>conjugateGradientMultiBlockCG</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''"> <!-- only evaluated when no Windows SDK version has been supplied yet -->
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion> <!-- asks MSBuild's ToolLocationHelper for the latest 'Windows' 10.0 SDK target platform version -->
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion> <!-- inner condition repeats the group condition; uses the version found above -->
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion> <!-- keeps TargetPlatformVersion equal to the selected SDK version -->
+  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
    <PlatformToolset>v141</PlatformToolset>
-	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.sln
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiBlockCG", "conjugateGradientMultiBlockCG_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2019.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientMultiBlockCG_vs2019</RootNamespace>
+    <ProjectName>conjugateGradientMultiBlockCG</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup> <!-- unconditional: these settings apply to both Debug and Release -->
+    <ConfigurationType>Application</ConfigurationType> <!-- the project builds an executable -->
+    <CharacterSet>MultiByte</CharacterSet> <!-- multi-byte character set rather than Unicode -->
+    <PlatformToolset>v142</PlatformToolset> <!-- Visual Studio 2019 C++ toolset -->
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> <!-- no exact SDK build is pinned. NOTE(review): VS2019 is expected to resolve 10.0 to the newest installed Windows 10 SDK; confirm -->
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup> <!-- unconditional defaults shared by every configuration -->
+    <ClCompile> <!-- host C++ compiler defaults -->
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- appends to, rather than replaces, inherited defines -->
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories> <!-- NOTE(review): $(CudaToolkitDir)/include and $(CudaToolkitIncludeDir) presumably name the same directory; confirm whether both are needed -->
+    </ClCompile>
+    <Link> <!-- linker defaults -->
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> <!-- CUDA device runtime, then the static CUDA runtime and Win32 import libraries -->
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
+    </Link>
+    <CudaCompile> <!-- defaults for items built through the CUDA build customization -->
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration> <!-- only architectures 6.0 and newer are targeted by this sample -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions> <!-- forwards /wd 4819 to the host compiler. NOTE(review): C4819 is presumably the code page warning; confirm -->
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+      <GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode> <!-- relocatable device code is enabled; cudadevrt.lib is listed in the Link dependencies of this group -->
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientMultiBlockCG.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientMultiDeviceCG/Makefile
+++ b/Samples/conjugateGradientMultiDeviceCG/Makefile
@ -1,5 +1,5 @@
 ################################################################################
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@ -286,7 +286,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
 endif
 endif

-ALL_CCFLAGS += -dc
+ALL_CCFLAGS += -dc -maxrregcount=64

 LIBRARIES += -lcudadevrt

--- a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml
+++ b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml
@ -4,6 +4,7 @@
  <name>conjugateGradientMultiDeviceCG</name>
  <cflags>
    <flag>-dc</flag>
+    <flag>-maxrregcount=64</flag>
  </cflags>
  <cuda_api_list>
    <toolkit>cudaMemAdvise</toolkit>
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu
@ -1,4 +1,4 @@
-/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj
@ -19,12 +19,16 @@
    <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
    <PlatformToolset>v141</PlatformToolset>
-	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.sln
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2019.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientMultiDeviceCG_vs2019</RootNamespace>
+    <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <!-- Compiler, linker and nvcc settings shared by every configuration
+       (this group carries no Condition attribute). -->
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <!-- cudadevrt.lib is listed first; the Makefile for this sample links -lcudadevrt as well. -->
+      <AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientMultiDeviceCG.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <!-- NOTE(review): the Makefile and NsightEclipse.xml of this sample pass
+           -maxrregcount=64 to nvcc, but no equivalent option is set here.
+           Confirm whether the Windows build is meant to differ. -->
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+      <!-- Relocatable device code; the Makefile counterpart of this sample passes -dc. -->
+      <GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientMultiDeviceCG.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cuSolverDn_LinearSolver/Makefile
+++ b/Samples/cuSolverDn_LinearSolver/Makefile
@ -0,0 +1,299 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project (only supported on Mac OS X and Linux platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on ARMv7
+ifeq ($(TARGET_ARCH),armv7l)
+  $(info >>> WARNING - cuSolverDn_LinearSolver is not supported on ARMv7 - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(TARGET_OS),linux)
+ALL_CCFLAGS += -Xcompiler \"-Wl,--no-as-needed\"
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+LIBRARIES += -lcusolver -lcublas -lcusparse
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+#
+# Apart from the object files and the final binary, none of the targets
+# below names a real file. Declare them phony so that a stray file called
+# e.g. "build" or "clean" in this directory cannot silently disable them.
+.PHONY: all build check.deps run clean clobber
+
+# Notes for the recipes below:
+#  * When the sample is waived, EXEC is set to an echo prefix, so each
+#    command line is printed instead of being executed.
+#  * GENCODE_FLAGS is not assigned anywhere in this Makefile; it expands to
+#    nothing unless the caller provides it (environment or command line).
+all: build
+
+build: cuSolverDn_LinearSolver
+
+# Report whether the sample is enabled or waived for the selected target.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Every source file, including the plain C one, is compiled through nvcc.
+cuSolverDn_LinearSolver.o:cuSolverDn_LinearSolver.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+mmio.c.o:mmio.c
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+mmio_wrapper.o:mmio_wrapper.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Link the sample, then copy the binary into the shared bin/ tree.
+cuSolverDn_LinearSolver: cuSolverDn_LinearSolver.o mmio.c.o mmio_wrapper.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./cuSolverDn_LinearSolver
+
+# Remove the local build products and the copy placed under bin/.
+clean:
+	rm -f cuSolverDn_LinearSolver cuSolverDn_LinearSolver.o mmio.c.o mmio_wrapper.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cuSolverDn_LinearSolver
+
+clobber: clean
--- a/Samples/cuSolverDn_LinearSolver/README.md
+++ b/Samples/cuSolverDn_LinearSolver/README.md
@ -0,0 +1,95 @@
+# cuSolverDn_LinearSolver - cuSolverDn Linear Solver
+
+## Description
+
+A CUDA Sample that demonstrates cuSolverDn's LU, QR and Cholesky factorization.
+
+## Key Concepts
+
+Linear Algebra, CUSOLVER Library
+
+## Supported SM Architectures
+
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows, MacOSX
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, aarch64
+
+## CUDA APIs involved
+
+## Dependencies needed to build/run
+[CUSOLVER](../../README.md#cusolver), [CUBLAS](../../README.md#cublas), [CUSPARSE](../../README.md#cusparse)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=g++
+    ```
+
+### Mac
+The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+
+The samples makefiles can take advantage of certain options:
+
+*  **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+
+*  **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
+    ```
+    $ make SMS="A B ..."
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=clang
+    ```
+
+## References (for more details)
+
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp
@ -0,0 +1,584 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *  Test three linear solvers, including Cholesky, LU and QR.
+ *  The user has to prepare a sparse matrix of "matrix market format" (with
+ * extension .mtx). For example, the user can download matrices in Florida
+ * Sparse Matrix Collection.
+ *  (http://www.cise.ufl.edu/research/sparse/matrices/)
+ *
+ *  The user needs to choose a solver by switch -R=<solver> and
+ *  to provide the path of the matrix by switch -file=<file>, then
+ *  the program solves
+ *          A*x = b  where b = ones(m,1)
+ *  and reports relative error
+ *          |b-A*x|/(|A|*|x|)
+ *
+ *  The elapsed time is also reported so the user can compare efficiency of
+ * different solvers.
+ *
+ *  How to use
+ *     ./cuSolverDn_LinearSolver                        // Default: cholesky
+ *     ./cuSolverDn_LinearSolver -R=chol -file=<file>   // cholesky factorization
+ *     ./cuSolverDn_LinearSolver -R=lu -file=<file>     // LU with partial pivoting
+ *     ./cuSolverDn_LinearSolver -R=qr -file=<file>     // QR factorization
+ *
+ *  Remark: the absolute error on solution x is meaningless without knowing
+ * condition number of A. The relative error on residual should be close to
+ * machine zero, i.e. 1.e-15.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cuda_runtime.h>
+
+#include "cublas_v2.h"
+#include "cusolverDn.h"
+#include "helper_cuda.h"
+
+#include "helper_cusolver.h"
+
+// Forward declaration of the matrix loader template. Only the declaration is
+// visible in this file; the definition has to be supplied by another
+// translation unit at link time (presumably mmio_wrapper.cpp, which the
+// sample's Makefile builds next to this file -- TODO confirm).
+// NOTE(review): judging by the names, the loader returns the dimensions
+// (m, n), the non-zero count (nnz) and the value/row/column arrays through
+// the pointer arguments -- verify against the definition before relying on it.
+template <typename T_ELEM>
+int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m,
+                       int *n, int *nnz, T_ELEM **aVal, int **aRowInd,
+                       int **aColInd, int extendSymMatrix);
+
+// Write the command-line help text to stdout and terminate the process with
+// exit status 0. This function never returns to its caller.
+void UsageDN(void) {
+  // One entry per output line, emitted verbatim and in order.
+  static const char *const kHelpLines[] = {
+      "<options>\n",
+      "-h          : display this help\n",
+      "-R=<name>    : choose a linear solver\n",
+      "              chol (cholesky factorization), this is default\n",
+      "              qr   (QR factorization)\n",
+      "              lu   (LU factorization)\n",
+      "-lda=<int> : leading dimension of A , m by default\n",
+      "-file=<filename>: filename containing a matrix in MM format\n",
+      "-device=<device_id> : <device_id> if want to run on specific GPU\n",
+  };
+  const size_t lineCount = sizeof(kHelpLines) / sizeof(kHelpLines[0]);
+
+  for (size_t idx = 0; idx < lineCount; ++idx) {
+    fputs(kHelpLines[idx], stdout);
+  }
+
+  exit(0);
+}
+
+/*
+ *  Solve A*x = b by Cholesky factorization (potrf followed by potrs).
+ *
+ *  handle : cuSOLVER dense handle used for every cusolverDn call below
+ *  n      : order of the matrix; also the number of doubles in b and x
+ *  Acopy  : input matrix. It is copied with cudaMemcpyDeviceToDevice into a
+ *           scratch buffer, so it is treated as a device pointer and is not
+ *           modified by this routine.
+ *  lda    : leading dimension of Acopy (lda * n doubles are copied)
+ *  b      : right-hand side, copied device-to-device into x before the solve
+ *  x      : receives the solution (overwritten in place by potrs)
+ *
+ *  The factorization is requested with CUBLAS_FILL_MODE_LOWER.
+ *  Every CUDA/cuSOLVER status is passed through checkCudaErrors. A non-zero
+ *  devInfo from either stage is reported on stderr, but the routine carries
+ *  on and still returns 0; the elapsed time is printed on stdout.
+ */
+int linearSolverCHOL(cusolverDnHandle_t handle, int n, const double *Acopy,
+                     int lda, const double *b, double *x) {
+  int bufferSize = 0;
+  int *info = NULL;
+  double *buffer = NULL;
+  double *A = NULL;
+  int h_info = 0;
+  double start, stop;
+  double time_solve;
+  cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+
+  // The size query takes a non-const pointer, hence the cast.
+  checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double *)Acopy,
+                                              lda, &bufferSize));
+
+  checkCudaErrors(cudaMalloc(&info, sizeof(int)));
+  checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
+  checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
+
+  // prepare a copy of A because potrf will overwrite A with L
+  checkCudaErrors(
+      cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
+  checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
+
+  // NOTE(review): second() is deliberately left called twice; presumably the
+  // first call absorbs any one-time timer setup -- confirm against the helper
+  // before collapsing the two calls into one.
+  start = second();
+  start = second();
+
+  checkCudaErrors(
+      cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info));
+
+  checkCudaErrors(
+      cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
+
+  if (0 != h_info) {
+    fprintf(stderr, "Error: Cholesky factorization failed\n");
+  }
+
+  checkCudaErrors(
+      cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
+
+  checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info));
+
+  checkCudaErrors(cudaDeviceSynchronize());
+  stop = second();
+
+  // potrs also reports through devInfo. Read it back only after the timed
+  // region so the extra copy does not affect the measurement.
+  checkCudaErrors(
+      cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
+  if (0 != h_info) {
+    fprintf(stderr, "Error: Cholesky solve failed (devInfo = %d)\n", h_info);
+  }
+
+  time_solve = stop - start;
+  fprintf(stdout, "timing: cholesky = %10.6f sec\n", time_solve);
+
+  if (info) {
+    checkCudaErrors(cudaFree(info));
+  }
+  if (buffer) {
+    checkCudaErrors(cudaFree(buffer));
+  }
+  if (A) {
+    checkCudaErrors(cudaFree(A));
+  }
+
+  return 0;
+}
+
+/*
+ *  Solve A*x = b by LU with partial pivoting (getrf followed by getrs).
+ *
+ *  handle : cuSOLVER dense handle used for every cusolverDn call below
+ *  n      : order of the matrix; also the number of doubles in b and x
+ *  Acopy  : input matrix. It is copied with cudaMemcpyDeviceToDevice into a
+ *           scratch buffer, so it is treated as a device pointer and is not
+ *           modified by this routine.
+ *  lda    : leading dimension of Acopy (lda * n doubles are copied)
+ *  b      : right-hand side, copied device-to-device into x before the solve
+ *  x      : receives the solution (overwritten in place by getrs)
+ *
+ *  Every CUDA/cuSOLVER status is passed through checkCudaErrors. A non-zero
+ *  devInfo from either stage is reported on stderr, but the routine carries
+ *  on and still returns 0; the elapsed time is printed on stdout.
+ */
+int linearSolverLU(cusolverDnHandle_t handle, int n, const double *Acopy,
+                   int lda, const double *b, double *x) {
+  int bufferSize = 0;
+  int *info = NULL;
+  double *buffer = NULL;
+  double *A = NULL;
+  int *ipiv = NULL;  // pivoting sequence
+  int h_info = 0;
+  double start, stop;
+  double time_solve;
+
+  // The size query takes a non-const pointer, hence the cast.
+  checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double *)Acopy,
+                                              lda, &bufferSize));
+
+  checkCudaErrors(cudaMalloc(&info, sizeof(int)));
+  checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
+  checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
+  checkCudaErrors(cudaMalloc(&ipiv, sizeof(int) * n));
+
+  // prepare a copy of A because getrf will overwrite A with its L and U
+  // factors
+  checkCudaErrors(
+      cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
+  checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
+
+  // NOTE(review): second() is deliberately left called twice; presumably the
+  // first call absorbs any one-time timer setup -- confirm against the helper
+  // before collapsing the two calls into one.
+  start = second();
+  start = second();
+
+  checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
+  checkCudaErrors(
+      cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
+
+  if (0 != h_info) {
+    fprintf(stderr, "Error: LU factorization failed\n");
+  }
+
+  checkCudaErrors(
+      cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
+  checkCudaErrors(
+      cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
+  checkCudaErrors(cudaDeviceSynchronize());
+  stop = second();
+
+  // getrs also reports through devInfo. Read it back only after the timed
+  // region so the extra copy does not affect the measurement.
+  checkCudaErrors(
+      cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
+  if (0 != h_info) {
+    fprintf(stderr, "Error: LU solve failed (devInfo = %d)\n", h_info);
+  }
+
+  time_solve = stop - start;
+  fprintf(stdout, "timing: LU = %10.6f sec\n", time_solve);
+
+  if (info) {
+    checkCudaErrors(cudaFree(info));
+  }
+  if (buffer) {
+    checkCudaErrors(cudaFree(buffer));
+  }
+  if (A) {
+    checkCudaErrors(cudaFree(A));
+  }
+  if (ipiv) {
+    checkCudaErrors(cudaFree(ipiv));
+  }
+
+  return 0;
+}
+
+/*
+ *  solve A*x = b by QR
+ *
+ *  Steps: geqrf factorizes a private copy of A, ormqr forms Q^T*b in x and a
+ *  cuBLAS triangular solve computes x = R \ (Q^T*b).
+ *
+ *  handle : cuSOLVER dense handle used for geqrf/ormqr
+ *  n      : order of the square system
+ *  Acopy  : device matrix with leading dimension lda; it is copied, never
+ *           modified
+ *  b      : device right-hand side of length n (read-only)
+ *  x      : device vector of length n, receives the solution
+ *
+ *  A failed factorization is reported on stderr but the solve is still
+ *  attempted. The elapsed wall-clock time is printed to stdout.
+ *  Always returns 0.
+ */
+int linearSolverQR(cusolverDnHandle_t handle, int n, const double *Acopy,
+                   int lda, const double *b, double *x) {
+  // used for the triangular solve
+  // NOTE(review): this handle is created here and never bound to a stream,
+  // unlike the cuSOLVER handle passed in - confirm this is intended.
+  cublasHandle_t cublasHandle = NULL;
+  int bufferSize = 0;
+  int bufferSize_geqrf = 0;
+  int bufferSize_ormqr = 0;
+  int *info = NULL;
+  double *buffer = NULL;
+  double *A = NULL;
+  double *tau = NULL;
+  int h_info = 0;
+  double start, stop;
+  double time_solve;
+  const double one = 1.0;
+
+  checkCudaErrors(cublasCreate(&cublasHandle));
+
+  // allocate A and tau before the workspace queries so that the queries
+  // receive valid device pointers instead of NULL
+  checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
+  checkCudaErrors(cudaMalloc((void **)&tau, sizeof(double) * n));
+
+  checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double *)Acopy,
+                                              lda, &bufferSize_geqrf));
+  checkCudaErrors(cusolverDnDormqr_bufferSize(handle, CUBLAS_SIDE_LEFT,
+                                              CUBLAS_OP_T, n, 1, n, A, lda,
+                                              tau, x, n, &bufferSize_ormqr));
+
+  printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf,
+         bufferSize_ormqr);
+
+  // one workspace is shared by geqrf and ormqr, so size it for the larger
+  bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf
+                                                     : bufferSize_ormqr;
+
+  checkCudaErrors(cudaMalloc(&info, sizeof(int)));
+  checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
+
+  // prepare a copy of A because geqrf overwrites A with its factorization
+  checkCudaErrors(
+      cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
+
+  checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
+
+  start = second();
+
+  // compute QR factorization
+  checkCudaErrors(
+      cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
+
+  checkCudaErrors(
+      cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
+
+  if (0 != h_info) {
+    fprintf(stderr, "Error: QR factorization failed\n");
+  }
+
+  // ormqr and trsm work in place, so seed x with the right-hand side
+  checkCudaErrors(
+      cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
+
+  // compute Q^T*b
+  checkCudaErrors(cusolverDnDormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1,
+                                   n, A, lda, tau, x, n, buffer, bufferSize,
+                                   info));
+
+  // x = R \ Q^T*b
+  checkCudaErrors(cublasDtrsm(cublasHandle, CUBLAS_SIDE_LEFT,
+                              CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                              CUBLAS_DIAG_NON_UNIT, n, 1, &one, A, lda, x, n));
+  // wait for the device so the host timer covers the whole solve
+  checkCudaErrors(cudaDeviceSynchronize());
+  stop = second();
+
+  time_solve = stop - start;
+  fprintf(stdout, "timing: QR = %10.6f sec\n", time_solve);
+
+  if (cublasHandle) {
+    checkCudaErrors(cublasDestroy(cublasHandle));
+  }
+  if (info) {
+    checkCudaErrors(cudaFree(info));
+  }
+  if (buffer) {
+    checkCudaErrors(cudaFree(buffer));
+  }
+  if (A) {
+    checkCudaErrors(cudaFree(A));
+  }
+  if (tau) {
+    checkCudaErrors(cudaFree(tau));
+  }
+
+  return 0;
+}
+
+/*
+ *  fill opts from the command line
+ *
+ *  opts is zeroed first, so every option that is not given stays 0/NULL.
+ *  Recognized options: -h (usage), R (solver: chol, lu or qr, compared
+ *  case-insensitively), file (matrix file name) and lda (leading dimension).
+ */
+void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) {
+  const char **args = (const char **)argv;
+
+  memset(&opts, 0, sizeof(opts));
+
+  if (checkCmdLineFlag(argc, args, "-h")) {
+    UsageDN();
+  }
+
+  if (checkCmdLineFlag(argc, args, "R")) {
+    char *requested = NULL;
+    getCmdLineArgumentString(argc, args, "R", &requested);
+
+    if (requested != NULL) {
+      // accept only the three solver names the sample implements
+      const bool known = (STRCASECMP(requested, "chol") == 0) ||
+                         (STRCASECMP(requested, "lu") == 0) ||
+                         (STRCASECMP(requested, "qr") == 0);
+      if (known) {
+        opts.testFunc = requested;
+      } else {
+        printf("\nIncorrect argument passed to -R option\n");
+        UsageDN();
+      }
+    }
+  }
+
+  if (checkCmdLineFlag(argc, args, "file")) {
+    char *path = 0;
+    getCmdLineArgumentString(argc, args, "file", &path);
+
+    if (!path) {
+      printf("\nIncorrect filename passed to -file \n ");
+      UsageDN();
+    } else {
+      opts.sparse_mat_filename = path;
+    }
+  }
+
+  if (checkCmdLineFlag(argc, args, "lda")) {
+    opts.lda = getCmdLineArgumentInt(argc, args, "lda");
+  }
+}
+
+/*
+ *  sample driver
+ *
+ *  Reads a sparse matrix in Matrix Market format, expands it into a dense
+ *  column-major matrix, solves A*x = b with b = ones using the solver chosen
+ *  by -R (Cholesky when none is given) and prints the residual norms.
+ */
+int main(int argc, char *argv[]) {
+  struct testOpts opts;
+  cusolverDnHandle_t handle = NULL;
+  cublasHandle_t cublasHandle = NULL;  // used in residual evaluation
+  cudaStream_t stream = NULL;
+
+  int rowsA = 0;  // number of rows of A
+  int colsA = 0;  // number of columns of A
+  int nnzA = 0;   // number of nonzeros of A
+  int baseA = 0;  // base index in CSR format
+  int lda = 0;    // leading dimension in dense matrix
+
+  // CSR(A) from I/O
+  int *h_csrRowPtrA = NULL;
+  int *h_csrColIndA = NULL;
+  double *h_csrValA = NULL;
+
+  double *h_A = NULL;  // dense matrix from CSR(A)
+  double *h_x = NULL;  // a copy of d_x
+  double *h_b = NULL;  // b = ones(m,1)
+  double *h_r = NULL;  // r = b - A*x, a copy of d_r
+
+  double *d_A = NULL;  // a copy of h_A
+  double *d_x = NULL;  // x = A \ b
+  double *d_b = NULL;  // a copy of h_b
+  double *d_r = NULL;  // r = b - A*x
+
+  // the constants are used in residual evaluation, r = b - A*x
+  const double minus_one = -1.0;
+  const double one = 1.0;
+
+  double x_inf = 0.0;
+  double r_inf = 0.0;
+  double A_inf = 0.0;
+
+  parseCommandLineArguments(argc, argv, opts);
+
+  if (NULL == opts.testFunc) {
+    opts.testFunc = "chol";  // By default running Cholesky as NO solver
+                             // selected with -R option.
+  }
+
+  findCudaDevice(argc, (const char **)argv);
+
+  printf("step 1: read matrix market format\n");
+
+  if (opts.sparse_mat_filename == NULL) {
+    opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
+    if (opts.sparse_mat_filename != NULL)
+      printf("Using default input file [%s]\n", opts.sparse_mat_filename);
+    else
+      printf("Could not find gr_900_900_crg.mtx\n");
+  } else {
+    printf("Using input file [%s]\n", opts.sparse_mat_filename);
+  }
+
+  if (opts.sparse_mat_filename == NULL) {
+    fprintf(stderr, "Error: input matrix is not provided\n");
+    return EXIT_FAILURE;
+  }
+
+  if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true, &rowsA,
+                                 &colsA, &nnzA, &h_csrValA, &h_csrRowPtrA,
+                                 &h_csrColIndA, true)) {
+    exit(EXIT_FAILURE);
+  }
+  baseA = h_csrRowPtrA[0];  // baseA = {0,1}
+
+  printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA,
+         nnzA, baseA);
+
+  if (rowsA != colsA) {
+    fprintf(stderr, "Error: only support square matrix\n");
+    exit(EXIT_FAILURE);
+  }
+
+  printf("step 2: convert CSR(A) to dense matrix\n");
+
+  lda = opts.lda ? opts.lda : rowsA;
+  if (lda < rowsA) {
+    fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
+    exit(EXIT_FAILURE);
+  }
+
+  h_A = (double *)malloc(sizeof(double) * lda * colsA);
+  h_x = (double *)malloc(sizeof(double) * colsA);
+  h_b = (double *)malloc(sizeof(double) * rowsA);
+  h_r = (double *)malloc(sizeof(double) * rowsA);
+  // explicit check: an assert would vanish in NDEBUG builds
+  if (NULL == h_A || NULL == h_x || NULL == h_b || NULL == h_r) {
+    fprintf(stderr, "Error: host memory allocation failed\n");
+    exit(EXIT_FAILURE);
+  }
+
+  memset(h_A, 0, sizeof(double) * lda * colsA);
+
+  // scatter every CSR entry into the column-major dense matrix
+  for (int row = 0; row < rowsA; row++) {
+    const int start = h_csrRowPtrA[row] - baseA;
+    const int end = h_csrRowPtrA[row + 1] - baseA;
+    for (int colidx = start; colidx < end; colidx++) {
+      const int col = h_csrColIndA[colidx] - baseA;
+      const double Areg = h_csrValA[colidx];
+      h_A[row + col * lda] = Areg;
+    }
+  }
+
+  printf("step 3: set right hand side vector (b) to 1\n");
+  for (int row = 0; row < rowsA; row++) {
+    h_b[row] = 1.0;
+  }
+
+  // verify if A is symmetric or not.
+  // The solver name is compared case-insensitively, matching the validation
+  // done in parseCommandLineArguments.
+  if (0 == STRCASECMP(opts.testFunc, "chol")) {
+    int issym = 1;
+    // stop scanning as soon as one mismatching pair is found
+    for (int j = 0; j < colsA && issym; j++) {
+      for (int i = j; i < rowsA; i++) {
+        double Aij = h_A[i + j * lda];
+        double Aji = h_A[j + i * lda];
+        if (Aij != Aji) {
+          issym = 0;
+          break;
+        }
+      }
+    }
+    if (!issym) {
+      printf("Error: A has no symmetric pattern, please use LU or QR \n");
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  checkCudaErrors(cusolverDnCreate(&handle));
+  checkCudaErrors(cublasCreate(&cublasHandle));
+  checkCudaErrors(cudaStreamCreate(&stream));
+
+  checkCudaErrors(cusolverDnSetStream(handle, stream));
+  checkCudaErrors(cublasSetStream(cublasHandle, stream));
+
+  checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double) * lda * colsA));
+  checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA));
+  checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA));
+  checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA));
+
+  printf("step 4: prepare data on device\n");
+  checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double) * lda * colsA,
+                             cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice));
+
+  printf("step 5: solve A*x = b \n");
+  // d_A and d_b are read-only
+  if (0 == STRCASECMP(opts.testFunc, "chol")) {
+    linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
+  } else if (0 == STRCASECMP(opts.testFunc, "lu")) {
+    linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
+  } else if (0 == STRCASECMP(opts.testFunc, "qr")) {
+    linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
+  } else {
+    fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
+    exit(EXIT_FAILURE);
+  }
+  printf("step 6: evaluate residual\n");
+  checkCudaErrors(
+      cudaMemcpy(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice));
+
+  // r = b - A*x
+  checkCudaErrors(cublasDgemm_v2(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, rowsA,
+                                 1, colsA, &minus_one, d_A, lda, d_x, rowsA,
+                                 &one, d_r, rowsA));
+
+  checkCudaErrors(
+      cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost));
+  checkCudaErrors(
+      cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost));
+
+  x_inf = vec_norminf(colsA, h_x);
+  r_inf = vec_norminf(rowsA, h_r);
+  A_inf = mat_norminf(rowsA, colsA, h_A, lda);
+
+  printf("|b - A*x| = %E \n", r_inf);
+  printf("|A| = %E \n", A_inf);
+  printf("|x| = %E \n", x_inf);
+  printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf));
+
+  if (handle) {
+    checkCudaErrors(cusolverDnDestroy(handle));
+  }
+  if (cublasHandle) {
+    checkCudaErrors(cublasDestroy(cublasHandle));
+  }
+  if (stream) {
+    checkCudaErrors(cudaStreamDestroy(stream));
+  }
+
+  if (h_csrValA) {
+    free(h_csrValA);
+  }
+  if (h_csrRowPtrA) {
+    free(h_csrRowPtrA);
+  }
+  if (h_csrColIndA) {
+    free(h_csrColIndA);
+  }
+
+  if (h_A) {
+    free(h_A);
+  }
+  if (h_x) {
+    free(h_x);
+  }
+  if (h_b) {
+    free(h_b);
+  }
+  if (h_r) {
+    free(h_r);
+  }
+
+  if (d_A) {
+    checkCudaErrors(cudaFree(d_A));
+  }
+  if (d_x) {
+    checkCudaErrors(cudaFree(d_x));
+  }
+  if (d_b) {
+    checkCudaErrors(cudaFree(d_b));
+  }
+  if (d_r) {
+    checkCudaErrors(cudaFree(d_r));
+  }
+
+  return 0;
+}
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2012.sln
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2012.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2012.vcxproj
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2012.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>cuSolverDn_LinearSolver_vs2012</RootNamespace>
+    <ProjectName>cuSolverDn_LinearSolver</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="cuSolverDn_LinearSolver.cpp" />
+    <ClCompile Include="mmio.c" />
+    <ClCompile Include="mmio_wrapper.cpp" />
+    <ClInclude Include="mmio.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2013.sln
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2013.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 13.00
+# Visual Studio 2013
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2013.vcxproj
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2013.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>cuSolverDn_LinearSolver_vs2013</RootNamespace>
+    <ProjectName>cuSolverDn_LinearSolver</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="cuSolverDn_LinearSolver.cpp" />
+    <ClCompile Include="mmio.c" />
+    <ClCompile Include="mmio_wrapper.cpp" />
+    <ClInclude Include="mmio.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2015.sln
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2015.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 14.00
+# Visual Studio 2015
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2015.vcxproj
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2015.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>cuSolverDn_LinearSolver_vs2015</RootNamespace>
+    <ProjectName>cuSolverDn_LinearSolver</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="cuSolverDn_LinearSolver.cpp" />
+    <ClCompile Include="mmio.c" />
+    <ClCompile Include="mmio_wrapper.cpp" />
+    <ClInclude Include="mmio.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.sln
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2017.vcxproj
@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>cuSolverDn_LinearSolver_vs2017</RootNamespace>
+    <ProjectName>cuSolverDn_LinearSolver</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="cuSolverDn_LinearSolver.cpp" />
+    <ClCompile Include="mmio.c" />
+    <ClCompile Include="mmio_wrapper.cpp" />
+    <ClInclude Include="mmio.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.sln
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cuSolverDn_LinearSolver", "cuSolverDn_LinearSolver_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj
+++ b/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver_vs2019.vcxproj
@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>cuSolverDn_LinearSolver_vs2019</RootNamespace>
+    <ProjectName>cuSolverDn_LinearSolver</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cusolver.lib;cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/cuSolverDn_LinearSolver.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="cuSolverDn_LinearSolver.cpp" />
+    <ClCompile Include="mmio.c" />
+    <ClCompile Include="mmio_wrapper.cpp" />
+    <ClInclude Include="mmio.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cuSolverDn_LinearSolver/gr_900_900_crg.mtx
+++ b/Samples/cuSolverDn_LinearSolver/gr_900_900_crg.mtx
--- a/Samples/cuSolverDn_LinearSolver/lap3D_7pt_n20.mtx
+++ b/Samples/cuSolverDn_LinearSolver/lap3D_7pt_n20.mtx
--- a/Samples/cuSolverDn_LinearSolver/mmio.c
+++ b/Samples/cuSolverDn_LinearSolver/mmio.c
@ -0,0 +1,521 @@
+/* 
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+/* avoid Windows warnings (for example: strcpy, fscanf, etc.) */
+#if defined(_WIN32)  
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "mmio.h"
+
+/*
+ *  Read a real, sparse (coordinate) Matrix Market file into newly allocated
+ *  index and value arrays.
+ *
+ *  fname         path of the file, opened here for reading
+ *  M_, N_, nz_   receive the three integers parsed from the size line
+ *  val_, I_, J_  receive malloc()'ed arrays of nz entries each; the row and
+ *                column indices are shifted from 1-based to 0-based
+ *
+ *  Returns 0 on success and -1 on any failure.  Once the file has been
+ *  opened it is closed on every path.  When a failure happens after the
+ *  size line was parsed, *val_, *I_ and *J_ are set to NULL and nothing
+ *  stays allocated, so the caller owns the arrays only when 0 is returned.
+ */
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_)
+{
+    FILE *f;
+    MM_typecode matcode;
+    int M, N, nz;
+    int i;
+    double *val = NULL;
+    int *I = NULL, *J = NULL;
+
+    if ((f = fopen(fname, "r")) == NULL)
+        return -1;
+
+    if (mm_read_banner(f, &matcode) != 0)
+    {
+        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
+        printf(" in file [%s]\n", fname);
+        fclose(f);
+        return -1;
+    }
+
+    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
+            mm_is_sparse(matcode)))
+    {
+        /* mm_typecode_to_str() hands back a heap string (or NULL) */
+        char *type_str = mm_typecode_to_str(matcode);
+
+        fprintf(stderr, "Sorry, this application does not support ");
+        fprintf(stderr, "Market Market type: [%s]\n",
+                (type_str != NULL) ? type_str : "unknown");
+        free(type_str);
+        fclose(f);
+        return -1;
+    }
+
+    /* find out size of sparse matrix: M, N, nz .... */
+    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
+    {
+        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
+        fclose(f);
+        return -1;
+    }
+
+    *M_ = M;
+    *N_ = N;
+    *nz_ = nz;
+    *val_ = NULL;
+    *I_ = NULL;
+    *J_ = NULL;
+
+    /* a negative count would wrap to a huge size in the malloc() calls */
+    if (nz < 0)
+    {
+        fclose(f);
+        return -1;
+    }
+
+    /* reserve memory for matrices */
+    I = (int *) malloc(nz * sizeof(int));
+    J = (int *) malloc(nz * sizeof(int));
+    val = (double *) malloc(nz * sizeof(double));
+
+    /* malloc(0) may legitimately return NULL, so only nz > 0 is an error */
+    if (nz > 0 && (I == NULL || J == NULL || val == NULL))
+        goto fail;
+
+    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+    for (i=0; i<nz; i++)
+    {
+        if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) != 3)
+            goto fail;
+        I[i]--;  /* adjust from 1-based to 0-based */
+        J[i]--;
+    }
+    fclose(f);
+
+    *val_ = val;
+    *I_ = I;
+    *J_ = J;
+    return 0;
+
+fail:
+    /* free(NULL) is a no-op, so partially allocated state is fine here */
+    free(I);
+    free(J);
+    free(val);
+    fclose(f);
+    return -1;
+}
+
+/*
+ *  Returns 1 when the typecode is a matrix and is none of the rejected
+ *  combinations (dense+pattern, real+hermitian, pattern+hermitian,
+ *  pattern+skew); returns 0 otherwise.
+ */
+int mm_is_valid(MM_typecode matcode)
+{
+    return mm_is_matrix(matcode)
+        && !(mm_is_dense(matcode) && mm_is_pattern(matcode))
+        && !(mm_is_real(matcode) && mm_is_hermitian(matcode))
+        && !(mm_is_pattern(matcode) &&
+                (mm_is_hermitian(matcode) || mm_is_skew(matcode)));
+}
+
+/* Lower-case a NUL-terminated string in place. */
+static void mm_lowercase(char *s)
+{
+    /* tolower() is only defined for unsigned char values and EOF */
+    for (; *s != '\0'; s++)
+        *s = (char) tolower((unsigned char) *s);
+}
+
+/*
+ *  Read the first line of f and decode it into *matcode.
+ *
+ *  The line must hold five whitespace-separated tokens: the banner, the
+ *  object ("matrix"), the format, the data type and the storage scheme.
+ *  All tokens except the banner are compared case-insensitively.
+ *
+ *  Returns 0 on success, MM_PREMATURE_EOF when no line could be read or it
+ *  has fewer than five tokens, MM_NO_HEADER when the first token does not
+ *  start with MatrixMarketBanner, and MM_UNSUPPORTED_TYPE when any other
+ *  token is not recognised.  *matcode is cleared first, so on failure it
+ *  holds only the fields decoded before the error.
+ */
+int mm_read_banner(FILE *f, MM_typecode *matcode)
+{
+    char line[MM_MAX_LINE_LENGTH];
+    char banner[MM_MAX_TOKEN_LENGTH];
+    char mtx[MM_MAX_TOKEN_LENGTH];
+    char crd[MM_MAX_TOKEN_LENGTH];
+    char data_type[MM_MAX_TOKEN_LENGTH];
+    char storage_scheme[MM_MAX_TOKEN_LENGTH];
+
+    mm_clear_typecode(matcode);
+
+    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+        return MM_PREMATURE_EOF;
+
+    /* Each token buffer holds MM_MAX_TOKEN_LENGTH (64) bytes, so cap every */
+    /* conversion at 63 characters plus the terminator; an unbounded "%s"   */
+    /* could overflow them from a line of up to MM_MAX_LINE_LENGTH bytes.   */
+    /* Keep the widths equal to MM_MAX_TOKEN_LENGTH - 1 if that changes.    */
+    if (sscanf(line, "%63s %63s %63s %63s %63s", banner, mtx, crd, data_type,
+        storage_scheme) != 5)
+        return MM_PREMATURE_EOF;
+
+    /* convert to lower case (the banner itself is matched as written) */
+    mm_lowercase(mtx);
+    mm_lowercase(crd);
+    mm_lowercase(data_type);
+    mm_lowercase(storage_scheme);
+
+    /* check for banner */
+    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
+        return MM_NO_HEADER;
+
+    /* first field should be "mtx" */
+    if (strcmp(mtx, MM_MTX_STR) != 0)
+        return  MM_UNSUPPORTED_TYPE;
+    mm_set_matrix(matcode);
+
+    /* second field describes whether this is a sparse matrix (in coordinate
+            storage) or a dense array */
+    if (strcmp(crd, MM_SPARSE_STR) == 0)
+        mm_set_sparse(matcode);
+    else if (strcmp(crd, MM_DENSE_STR) == 0)
+        mm_set_dense(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+    /* third field: element data type */
+    if (strcmp(data_type, MM_REAL_STR) == 0)
+        mm_set_real(matcode);
+    else if (strcmp(data_type, MM_COMPLEX_STR) == 0)
+        mm_set_complex(matcode);
+    else if (strcmp(data_type, MM_PATTERN_STR) == 0)
+        mm_set_pattern(matcode);
+    else if (strcmp(data_type, MM_INT_STR) == 0)
+        mm_set_integer(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+    /* fourth field: storage scheme */
+    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
+        mm_set_general(matcode);
+    else if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
+        mm_set_symmetric(matcode);
+    else if (strcmp(storage_scheme, MM_HERM_STR) == 0)
+        mm_set_hermitian(matcode);
+    else if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
+        mm_set_skew(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+    return 0;
+}
+
+/*
+ *  Write the coordinate size line "M N nz" to f.
+ *
+ *  Returns 0 on success and MM_COULD_NOT_WRITE_FILE when fprintf() fails.
+ *  fprintf() returns the number of characters written (negative on error),
+ *  not the number of converted arguments, so only the sign is tested.
+ */
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
+{
+    if (fprintf(f, "%d %d %d\n", M, N, nz) < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+
+    return 0;
+}
+
+/*
+ *  Skip the '%' comment lines of f and read the coordinate size line into
+ *  *M, *N and *nz.
+ *
+ *  If the first non-comment line does not hold three integers, the next
+ *  three integers are taken from the stream instead.
+ *
+ *  Returns 0 on success and MM_PREMATURE_EOF when the sizes cannot be
+ *  read; in that case *M, *N and *nz are all 0.
+ */
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
+{
+    char line[MM_MAX_LINE_LENGTH];
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = *nz = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do
+    {
+        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
+            return MM_PREMATURE_EOF;
+    }while (line[0] == '%');
+
+    /* line[] is either blank or has M,N, nz */
+    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
+        return 0;
+
+    /* "%d" already skips blank lines.  A short count therefore means end  */
+    /* of input or a non-numeric character that fscanf() will never step   */
+    /* over, so retrying (as a loop would) could spin forever.             */
+    if (fscanf(f, "%d %d %d", M, N, nz) != 3)
+    {
+        *M = *N = *nz = 0;
+        return MM_PREMATURE_EOF;
+    }
+
+    return 0;
+}
+
+
+/*
+ *  Skip the '%' comment lines of f and read the array size line into
+ *  *M and *N.
+ *
+ *  If the first non-comment line does not hold two integers, the next two
+ *  integers are taken from the stream instead.
+ *
+ *  Returns 0 on success and MM_PREMATURE_EOF when the sizes cannot be
+ *  read; in that case *M and *N are both 0.
+ */
+int mm_read_mtx_array_size(FILE *f, int *M, int *N)
+{
+    char line[MM_MAX_LINE_LENGTH];
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do
+    {
+        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
+            return MM_PREMATURE_EOF;
+    }while (line[0] == '%');
+
+    /* line[] is either blank or has M,N */
+    if (sscanf(line, "%d %d", M, N) == 2)
+        return 0;
+
+    /* "%d" already skips blank lines.  A short count therefore means end  */
+    /* of input or a non-numeric character that fscanf() will never step   */
+    /* over, so retrying (as a loop would) could spin forever.             */
+    if (fscanf(f, "%d %d", M, N) != 2)
+    {
+        *M = *N = 0;
+        return MM_PREMATURE_EOF;
+    }
+
+    return 0;
+}
+
+/*
+ *  Write the array size line "M N" to f.
+ *
+ *  Returns 0 on success and MM_COULD_NOT_WRITE_FILE when fprintf() fails.
+ *  fprintf() returns the number of characters written (negative on error),
+ *  not the number of converted arguments, so only the sign is tested.
+ */
+int mm_write_mtx_array_size(FILE *f, int M, int N)
+{
+    if (fprintf(f, "%d %d\n", M, N) < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+
+    return 0;
+}
+
+
+
+/*-------------------------------------------------------------------------*/
+
+/******************************************************************/
+/* use when I[], J[], and val[] are already allocated             */
+/******************************************************************/
+
+/*
+ *  Read nz coordinate entries from f into caller-provided arrays.
+ *
+ *  complex        : "row col re im" per entry, stored at val[2*k], val[2*k+1]
+ *  real / integer : "row col value" per entry, stored at val[k]
+ *  pattern        : "row col" per entry, val[] is not touched
+ *
+ *  Indices are stored exactly as read.  M and N are not used here.
+ *  Returns 0 on success, MM_PREMATURE_EOF as soon as an entry cannot be
+ *  fully parsed, and MM_UNSUPPORTED_TYPE for any other data type.
+ */
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode)
+{
+    int k;
+
+    if (mm_is_complex(matcode))
+    {
+        for (k = 0; k < nz; ++k)
+        {
+            if (fscanf(f, "%d %d %lg %lg", I + k, J + k,
+                       val + 2*k, val + 2*k + 1) != 4)
+                return MM_PREMATURE_EOF;
+        }
+        return 0;
+    }
+
+    if (mm_is_real(matcode) || mm_is_integer(matcode))
+    {
+        for (k = 0; k < nz; ++k)
+        {
+            if (fscanf(f, "%d %d %lg\n", I + k, J + k, val + k) != 3)
+                return MM_PREMATURE_EOF;
+        }
+        return 0;
+    }
+
+    if (mm_is_pattern(matcode))
+    {
+        for (k = 0; k < nz; ++k)
+        {
+            if (fscanf(f, "%d %d", I + k, J + k) != 2)
+                return MM_PREMATURE_EOF;
+        }
+        return 0;
+    }
+
+    return MM_UNSUPPORTED_TYPE;
+}
+
+/*
+ *  Read a single coordinate entry from f.
+ *
+ *  complex entries fill *I, *J, *real and *imag; real and integer entries
+ *  fill *I, *J and *real; pattern entries fill only *I and *J.
+ *
+ *  Returns 0 on success, MM_PREMATURE_EOF when the entry cannot be fully
+ *  parsed, and MM_UNSUPPORTED_TYPE for any other data type.
+ */
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
+        double *real, double *imag, MM_typecode matcode)
+{
+    int expected;   /* number of fields this entry type carries */
+    int matched;    /* number of fields fscanf() actually converted */
+
+    if (mm_is_complex(matcode))
+    {
+        expected = 4;
+        matched = fscanf(f, "%d %d %lg %lg", I, J, real, imag);
+    }
+    else if (mm_is_real(matcode) || mm_is_integer(matcode))
+    {
+        expected = 3;
+        matched = fscanf(f, "%d %d %lg\n", I, J, real);
+    }
+    else if (mm_is_pattern(matcode))
+    {
+        expected = 2;
+        matched = fscanf(f, "%d %d", I, J);
+    }
+    else
+    {
+        return MM_UNSUPPORTED_TYPE;
+    }
+
+    return (matched == expected) ? 0 : MM_PREMATURE_EOF;
+}
+
+
+/*
+ *  Parse a complete coordinate matrix from an already opened stream.
+ *  Does the work of mm_read_mtx_crd() except opening and closing f.
+ */
+static int mm_read_mtx_crd_stream(FILE *f, int *M, int *N, int *nz, int **I,
+        int **J, double **val, MM_typecode *matcode)
+{
+    int ret_code;
+    int has_values;
+
+    if ((ret_code = mm_read_banner(f, matcode)) != 0)
+        return ret_code;
+
+    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
+            mm_is_matrix(*matcode)))
+        return MM_UNSUPPORTED_TYPE;
+
+    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
+        return ret_code;
+
+    *I = NULL;
+    *J = NULL;
+    *val = NULL;
+
+    /* a negative count would wrap to a huge size in the malloc() calls */
+    if (*nz < 0)
+        return MM_COULD_NOT_READ_FILE;
+
+    has_values = mm_is_complex(*matcode) || mm_is_real(*matcode) ||
+                 mm_is_integer(*matcode);
+
+    *I = (int *)  malloc(*nz * sizeof(int));
+    *J = (int *)  malloc(*nz * sizeof(int));
+    if (mm_is_complex(*matcode))
+        *val = (double *) malloc(*nz * 2 * sizeof(double));
+    else if (has_values)
+        *val = (double *) malloc(*nz * sizeof(double));
+
+    /* malloc(0) may legitimately return NULL, so only nz > 0 is an error */
+    if (*nz > 0 && (*I == NULL || *J == NULL ||
+            (has_values && *val == NULL)))
+        ret_code = MM_COULD_NOT_READ_FILE;
+    else if (has_values || mm_is_pattern(*matcode))
+        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
+                *matcode);
+
+    if (ret_code != 0)
+    {
+        /* do not hand partially filled buffers back to the caller */
+        free(*I);
+        free(*J);
+        free(*val);
+        *I = NULL;
+        *J = NULL;
+        *val = NULL;
+    }
+
+    return ret_code;
+}
+
+/************************************************************************
+    mm_read_mtx_crd()  fills M, N, nz, array of values, and return
+                        type code, e.g. 'MCRS'
+
+                        if matrix is complex, values[] is of size 2*nz,
+                            (nz pairs of real/imaginary values)
+
+                        fname "stdin" reads from standard input instead
+                        of opening a file.
+
+                        returns 0 on success, otherwise one of the MM_*
+                        error codes (MM_COULD_NOT_READ_FILE is also used
+                        for a negative nz or a failed allocation).  A file
+                        opened here is closed on every path.  When a
+                        failure happens after the size line was parsed,
+                        *I, *J and *val are NULL and nothing stays
+                        allocated; on earlier failures they are untouched.
+************************************************************************/
+
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
+        double **val, MM_typecode *matcode)
+{
+    int ret_code;
+    FILE *f;
+
+    if (strcmp(fname, "stdin") == 0)
+        f = stdin;
+    else if ((f = fopen(fname, "r")) == NULL)
+        return MM_COULD_NOT_READ_FILE;
+
+    ret_code = mm_read_mtx_crd_stream(f, M, N, nz, I, J, val, matcode);
+
+    if (f != stdin) fclose(f);
+    return ret_code;
+}
+
+/*
+ *  Write the Matrix Market banner line for matcode to f.
+ *
+ *  Returns 0 on success, MM_UNSUPPORTED_TYPE when mm_typecode_to_str()
+ *  yields no string for matcode, and MM_COULD_NOT_WRITE_FILE when
+ *  fprintf() fails.  fprintf() returns the number of characters written
+ *  (negative on error), not the number of converted arguments, so only
+ *  the sign is tested.
+ */
+int mm_write_banner(FILE *f, MM_typecode matcode)
+{
+    char *str = mm_typecode_to_str(matcode);
+    int ret_code;
+
+    /* passing NULL to "%s" is undefined behaviour */
+    if (str == NULL)
+        return MM_UNSUPPORTED_TYPE;
+
+    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
+    free(str);
+
+    return (ret_code < 0) ? MM_COULD_NOT_WRITE_FILE : 0;
+}
+
+/*
+ *  Write a coordinate matrix: banner line, size line, then nz entries.
+ *
+ *  fname "stdout" writes to standard output instead of opening a file.
+ *  Indices are written exactly as stored in I[] and J[].  Pattern
+ *  matrices write no value, integer matrices write (int)val[k], real
+ *  matrices val[k], and complex matrices val[2*k] and val[2*k+1].
+ *
+ *  Returns 0 on success, MM_UNSUPPORTED_TYPE when matcode has no textual
+ *  form (the output is then not opened at all), and
+ *  MM_COULD_NOT_WRITE_FILE when the file cannot be opened.
+ */
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode)
+{
+    FILE *f;
+    int i;
+    char *type_str;
+
+    /* Build the typecode text before opening the output: fopen(..., "w") */
+    /* truncates, and an invalid typecode should not wipe an existing file */
+    type_str = mm_typecode_to_str(matcode);
+    if (type_str == NULL)
+        return MM_UNSUPPORTED_TYPE;
+
+    if (strcmp(fname, "stdout") == 0)
+        f = stdout;
+    else if ((f = fopen(fname, "w")) == NULL)
+    {
+        free(type_str);
+        return MM_COULD_NOT_WRITE_FILE;
+    }
+
+    /* print banner followed by typecode */
+    fprintf(f, "%s ", MatrixMarketBanner);
+    fprintf(f, "%s\n", type_str);
+    free(type_str);   /* heap string owned by us, previously leaked */
+
+    /* print matrix sizes and nonzeros */
+    fprintf(f, "%d %d %d\n", M, N, nz);
+
+    /* print values */
+    if (mm_is_pattern(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d\n", I[i], J[i]);
+    else if (mm_is_integer(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %d\n", I[i], J[i], (int)val[i]);
+    else if (mm_is_real(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
+    else if (mm_is_complex(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
+                        val[2*i+1]);
+    else
+    {
+        if (f != stdout) fclose(f);
+        return MM_UNSUPPORTED_TYPE;
+    }
+
+    if (f != stdout) fclose(f);
+
+    return 0;
+}
+  
+
+/**
+*  Create a new heap copy of the string s.  strdup() is a common routine,
+*  but not part of ANSI C, so it is included here.  Used by
+*  mm_typecode_to_str().
+*
+*  Returns the copy, which the caller must free(), or NULL when the
+*  allocation fails.
+*/
+static char *mm_strdup(const char *s)
+{
+    size_t bytes = strlen(s) + 1;   /* include the terminating NUL */
+    char *copy = (char *) malloc(bytes);
+
+    /* never write through a failed allocation */
+    if (copy == NULL)
+        return NULL;
+
+    memcpy(copy, s, bytes);
+    return copy;
+}
+
+/*
+ *  Render a typecode as "<object> <format> <data type> <storage scheme>".
+ *
+ *  Returns a heap string produced by mm_strdup(), which the caller must
+ *  free(), or NULL when any of the four fields holds no recognised value.
+ */
+char  *mm_typecode_to_str(MM_typecode matcode)
+{
+    char text[MM_MAX_LINE_LENGTH];
+    const char *object_str;
+    const char *format_str;
+    const char *field_str;
+    const char *scheme_str;
+
+    /* check for MTX type */
+    if (!mm_is_matrix(matcode))
+        return NULL;
+    object_str = MM_MTX_STR;
+
+    /* check for CRD or ARR matrix */
+    if (mm_is_sparse(matcode))
+        format_str = MM_SPARSE_STR;
+    else if (mm_is_dense(matcode))
+        format_str = MM_DENSE_STR;
+    else
+        return NULL;
+
+    /* check for element data type */
+    if (mm_is_real(matcode))
+        field_str = MM_REAL_STR;
+    else if (mm_is_complex(matcode))
+        field_str = MM_COMPLEX_STR;
+    else if (mm_is_pattern(matcode))
+        field_str = MM_PATTERN_STR;
+    else if (mm_is_integer(matcode))
+        field_str = MM_INT_STR;
+    else
+        return NULL;
+
+    /* check for symmetry type */
+    if (mm_is_general(matcode))
+        scheme_str = MM_GENERAL_STR;
+    else if (mm_is_symmetric(matcode))
+        scheme_str = MM_SYMM_STR;
+    else if (mm_is_hermitian(matcode))
+        scheme_str = MM_HERM_STR;
+    else if (mm_is_skew(matcode))
+        scheme_str = MM_SKEW_STR;
+    else
+        return NULL;
+
+    sprintf(text,"%s %s %s %s", object_str, format_str, field_str, scheme_str);
+    return mm_strdup(text);
+}
--- a/Samples/cuSolverDn_LinearSolver/mmio.h
+++ b/Samples/cuSolverDn_LinearSolver/mmio.h
@ -0,0 +1,141 @@
+/* 
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+#ifndef MM_IO_H
+#define MM_IO_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#define MM_MAX_LINE_LENGTH 1025
+#define MatrixMarketBanner "%%MatrixMarket"
+#define MM_MAX_TOKEN_LENGTH 64
+
+typedef char MM_typecode[4];
+
+char *mm_typecode_to_str(MM_typecode matcode);
+
+int mm_read_banner(FILE *f, MM_typecode *matcode);
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
+int mm_read_mtx_array_size(FILE *f, int *M, int *N);
+
+int mm_write_banner(FILE *f, MM_typecode matcode);
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
+int mm_write_mtx_array_size(FILE *f, int M, int N);
+
+
+/********************* MM_typecode query functions ***************************/
+
+/* Each query compares one character of the 4-character typecode and    */
+/* evaluates to non-zero on a match.  `typecode` is indexed directly,    */
+/* so pass the MM_typecode itself, not a pointer to it.                  */
+
+/* [0]: object */
+#define mm_is_matrix(typecode)	((typecode)[0]=='M')
+
+/* [1]: format -- sparse/coordinate both test 'C', dense/array both 'A' */
+#define mm_is_sparse(typecode)	((typecode)[1]=='C')
+#define mm_is_coordinate(typecode)((typecode)[1]=='C')
+#define mm_is_dense(typecode)	((typecode)[1]=='A')
+#define mm_is_array(typecode)	((typecode)[1]=='A')
+
+/* [2]: element data type */
+#define mm_is_complex(typecode)	((typecode)[2]=='C')
+#define mm_is_real(typecode)		((typecode)[2]=='R')
+#define mm_is_pattern(typecode)	((typecode)[2]=='P')
+#define mm_is_integer(typecode) ((typecode)[2]=='I')
+
+/* [3]: storage scheme */
+#define mm_is_symmetric(typecode)((typecode)[3]=='S')
+#define mm_is_general(typecode)	((typecode)[3]=='G')
+#define mm_is_skew(typecode)	((typecode)[3]=='K')
+#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+
+int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
+
+
+/********************* MM_typecode modify functions ***************************/
+
+/* Each setter stores one character of the typecode.  `typecode` is a    */
+/* POINTER to an MM_typecode.  The argument is parenthesized before it   */
+/* is dereferenced so that expressions such as `codes + 1` expand to     */
+/* (*(codes + 1))[n] instead of (*codes + 1)[n].                         */
+
+/* [0]: object */
+#define mm_set_matrix(typecode)	((*(typecode))[0]='M')
+
+/* [1]: format -- dense/sparse are aliases of array/coordinate */
+#define mm_set_coordinate(typecode)	((*(typecode))[1]='C')
+#define mm_set_array(typecode)	((*(typecode))[1]='A')
+#define mm_set_dense(typecode)	mm_set_array(typecode)
+#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
+
+/* [2]: element data type */
+#define mm_set_complex(typecode)((*(typecode))[2]='C')
+#define mm_set_real(typecode)	((*(typecode))[2]='R')
+#define mm_set_pattern(typecode)((*(typecode))[2]='P')
+#define mm_set_integer(typecode)((*(typecode))[2]='I')
+
+
+/* [3]: storage scheme */
+#define mm_set_symmetric(typecode)((*(typecode))[3]='S')
+#define mm_set_general(typecode)((*(typecode))[3]='G')
+#define mm_set_skew(typecode)	((*(typecode))[3]='K')
+#define mm_set_hermitian(typecode)((*(typecode))[3]='H')
+
+/* Reset: positions [0]..[2] become ' ' and the storage scheme 'G'. */
+#define mm_clear_typecode(typecode) ((*(typecode))[0]=(*(typecode))[1]= \
+									(*(typecode))[2]=' ',(*(typecode))[3]='G')
+
+#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
+
+
+/********************* Matrix Market error codes ***************************/
+
+
+/* Non-zero status codes returned by the mm_* routines; 0 means success. */
+
+/* the input file could not be opened for reading */
+#define MM_COULD_NOT_READ_FILE	11
+/* a line or field was missing where more input was expected */
+#define MM_PREMATURE_EOF		12
+/* NOTE(review): not returned by the routines in mmio.c -- presumably    */
+/* reserved for "object is not a matrix"; confirm before relying on it   */
+#define MM_NOT_MTX				13
+/* the first token of the file does not start with MatrixMarketBanner */
+#define MM_NO_HEADER			14
+/* a banner field or typecode is not one this library handles */
+#define MM_UNSUPPORTED_TYPE		15
+/* NOTE(review): not returned by the routines in mmio.c -- presumably    */
+/* meant for lines longer than MM_MAX_LINE_LENGTH; confirm               */
+#define MM_LINE_TOO_LONG		16
+/* the output file could not be opened or written */
+#define MM_COULD_NOT_WRITE_FILE	17
+
+
+/******************** Matrix Market internal definitions ********************
+
+   MM_matrix_typecode: 4-character sequence
+
+                     object    sparse/    data        storage
+                               dense      type        scheme
+
+   string position:  [0]       [1]        [2]         [3]
+
+   Matrix typecode:  M(atrix)  C(oord)    R(eal)      G(eneral)
+                               A(rray)    C(omplex)   H(ermitian)
+                                          P(attern)   S(ymmetric)
+                                          I(nteger)   K (skew)
+
+ ***********************************************************************/
+
+#define MM_MTX_STR		"matrix"
+#define MM_ARRAY_STR	"array"
+#define MM_DENSE_STR	"array"
+#define MM_COORDINATE_STR "coordinate" 
+#define MM_SPARSE_STR	"coordinate"
+#define MM_COMPLEX_STR	"complex"
+#define MM_REAL_STR		"real"
+#define MM_INT_STR		"integer"
+#define MM_GENERAL_STR  "general"
+#define MM_SYMM_STR		"symmetric"
+#define MM_HERM_STR		"hermitian"
+#define MM_SKEW_STR		"skew-symmetric"
+#define MM_PATTERN_STR  "pattern"
+
+
+/*  high level routines */
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
+        double **val, MM_typecode *matcode);
+
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+		 double val[], MM_typecode matcode);
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+		double val[], MM_typecode matcode);
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
+			MM_typecode matcode);
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */  
+
+#endif
--- a/Samples/cuSolverDn_LinearSolver/mmio_wrapper.cpp
+++ b/Samples/cuSolverDn_LinearSolver/mmio_wrapper.cpp
@ -0,0 +1,529 @@
+
+/* Avoid Windows CRT deprecation warnings (for example: strcpy, fscanf, etc.).
+ * The macro only takes effect when it is defined before the first CRT header
+ * is included, so it has to precede <stdio.h>.  The extra guard keeps a
+ * definition supplied on the compiler command line from being redefined. */
+#if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS)
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "mmio.h"
+
+#include <cusolverDn.h>
+
+/* Build a T_ELEM from one integer.  These are plain inline functions (no
+ * __device__/__host__ qualifier is applied).  Real element types receive the
+ * converted value; complex element types receive it as the real part with a
+ * zero imaginary part. */
+template <typename T_ELEM> __inline__ T_ELEM cuGet(int);
+
+template <> __inline__ float cuGet<float>(int x)
+{
+    return static_cast<float>(x);
+}
+
+template <> __inline__ double cuGet<double>(int x)
+{
+    return static_cast<double>(x);
+}
+
+template <> __inline__ cuComplex cuGet<cuComplex>(int x)
+{
+    const float re = static_cast<float>(x);
+    return make_cuComplex(re, 0.0f);
+}
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x)
+{
+    const double re = static_cast<double>(x);
+    return make_cuDoubleComplex(re, 0.0);
+}
+
+
+/* Build a T_ELEM from two integers read as (real, imaginary).  The real
+ * element types keep x only and discard y. */
+template <typename T_ELEM> __inline__ T_ELEM cuGet(int, int);
+
+template <> __inline__ float cuGet<float>(int x, int y)
+{
+    return static_cast<float>(x);
+}
+
+template <> __inline__ double cuGet<double>(int x, int y)
+{
+    return static_cast<double>(x);
+}
+
+template <> __inline__ cuComplex cuGet<cuComplex>(int x, int y)
+{
+    const float re = static_cast<float>(x);
+    const float im = static_cast<float>(y);
+    return make_cuComplex(re, im);
+}
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(int x, int y)
+{
+    const double re = static_cast<double>(x);
+    const double im = static_cast<double>(y);
+    return make_cuDoubleComplex(re, im);
+}
+
+
+/* Build a T_ELEM from one float.  Complex element types receive the value as
+ * the real part with a zero imaginary part. */
+template <typename T_ELEM> __inline__ T_ELEM cuGet(float);
+
+template <> __inline__ float cuGet<float>(float x)
+{
+    return x;
+}
+
+template <> __inline__ double cuGet<double>(float x)
+{
+    return static_cast<double>(x);
+}
+
+template <> __inline__ cuComplex cuGet<cuComplex>(float x)
+{
+    return make_cuComplex(x, 0.0f);
+}
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x)
+{
+    const double re = static_cast<double>(x);
+    return make_cuDoubleComplex(re, 0.0);
+}
+
+
+/* Build a T_ELEM from two floats read as (real, imaginary).  The real
+ * element types keep x only and discard y. */
+template <typename T_ELEM> __inline__ T_ELEM cuGet(float, float);
+
+template <> __inline__ float cuGet<float>(float x, float y)
+{
+    return x;
+}
+
+template <> __inline__ double cuGet<double>(float x, float y)
+{
+    return static_cast<double>(x);
+}
+
+template <> __inline__ cuComplex cuGet<cuComplex>(float x, float y)
+{
+    return make_cuComplex(x, y);
+}
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(float x, float y)
+{
+    const double re = static_cast<double>(x);
+    const double im = static_cast<double>(y);
+    return make_cuDoubleComplex(re, im);
+}
+
+
+/* Build a T_ELEM from one double.  Single-precision element types narrow the
+ * value to float; complex element types receive it as the real part with a
+ * zero imaginary part. */
+template <typename T_ELEM> __inline__ T_ELEM cuGet(double);
+
+template <> __inline__ float cuGet<float>(double x)
+{
+    return static_cast<float>(x);
+}
+
+template <> __inline__ double cuGet<double>(double x)
+{
+    return x;
+}
+
+template <> __inline__ cuComplex cuGet<cuComplex>(double x)
+{
+    const float re = static_cast<float>(x);
+    return make_cuComplex(re, 0.0f);
+}
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x)
+{
+    return make_cuDoubleComplex(x, 0.0);
+}
+
+
+/* Build a T_ELEM from two doubles read as (real, imaginary).  The real
+ * element types keep x only and discard y; single-precision types narrow
+ * to float. */
+template <typename T_ELEM> __inline__ T_ELEM cuGet(double, double);
+
+template <> __inline__ float cuGet<float>(double x, double y)
+{
+    return static_cast<float>(x);
+}
+
+template <> __inline__ double cuGet<double>(double x, double y)
+{
+    return x;
+}
+
+template <> __inline__ cuComplex cuGet<cuComplex>(double x, double y)
+{
+    const float re = static_cast<float>(x);
+    const float im = static_cast<float>(y);
+    return make_cuComplex(re, im);
+}
+
+template <> __inline__ cuDoubleComplex cuGet<cuDoubleComplex>(double x, double y)
+{
+    return make_cuDoubleComplex(x, y);
+}
+
+
+
+
+
+/*
+ * Turn a list of row (or column) indices into a compressed pointer array.
+ *
+ *   Ind   nnz indices, expressed in index base `base`
+ *   nnz   number of entries in Ind
+ *   m     number of rows (or columns); Ptr must have room for m+1 ints
+ *   Ptr   output: Ptr[0] = base and Ptr[k+1] - Ptr[k] is the number of
+ *         entries of Ind equal to k + base
+ *   base  index base used by Ind (the callers in this file pass 0 or 1)
+ *
+ * Entries of Ind outside [base, base + m - 1] are ignored.  Counting them
+ * would write outside Ptr (or, for Ind[i] == base - 1, corrupt Ptr[0]).
+ * A caller can detect skipped entries because Ptr[m] - Ptr[0] is then
+ * smaller than nnz, which verify_pattern() in this file reports.
+ */
+static void compress_index(
+    const int *Ind, 
+    int nnz, 
+    int m, 
+    int *Ptr, 
+    int base)
+{
+    int i;
+
+    /* slot 0 carries the index base, the m counters start from zero */
+    Ptr[0] = base;
+    for (i = 1; i <= m; i++) {
+        Ptr[i] = 0;
+    }
+    /* count elements in every row: row r is tallied in Ptr[r - base + 1] */
+    for (i = 0; i < nnz; i++) {
+        /* test the lower bound first so that the subtraction cannot go negative */
+        if ((Ind[i] < base) || ((Ind[i] - base) >= m)) {
+            continue;
+        }
+        Ptr[Ind[i] - base + 1]++;
+    }
+    /* running sum: turn the per-row counts into start offsets */
+    for (i = 0; i < m; i++) {
+        Ptr[i + 1] += Ptr[i];
+    }
+}
+
+
+/* One nonzero of a coordinate (COO) matrix; the record type sorted with
+ * qsort() by loadMMSparseMatrix(). */
+struct cooFormat {
+    int i;  // row index
+    int j;  // column index
+    int p;  // position of the entry before sorting (permutation)
+};
+
+
+/*
+ * Row-major ordering of two COO entries: compare by row index i, then by
+ * column index j.  Returns a negative, zero or positive value when *s sorts
+ * before, equal to, or after *t.
+ *
+ * The column tie-break uses comparisons rather than s->j - t->j: the
+ * difference of two ints can overflow (for example a large positive and a
+ * negative index coming from a malformed file), which is undefined behaviour
+ * and can yield the wrong sign.
+ */
+int cmp_cooFormat_csr( struct cooFormat *s, struct cooFormat *t)
+{
+    if ( s->i != t->i ){
+        return ( s->i < t->i ) ? -1 : 1 ;
+    }
+    return ( s->j > t->j ) - ( s->j < t->j ) ;
+}
+
+/*
+ * Column-major ordering of two COO entries: compare by column index j, then
+ * by row index i.  Returns a negative, zero or positive value when *s sorts
+ * before, equal to, or after *t.
+ *
+ * The row tie-break uses comparisons rather than s->i - t->i: the difference
+ * of two ints can overflow (for example a large positive and a negative index
+ * coming from a malformed file), which is undefined behaviour and can yield
+ * the wrong sign.
+ */
+int cmp_cooFormat_csc( struct cooFormat *s, struct cooFormat *t)
+{
+    if ( s->j != t->j ){
+        return ( s->j < t->j ) ? -1 : 1 ;
+    }
+    return ( s->i > t->i ) - ( s->i < t->i ) ;
+}
+
+/* FUNPTR is the comparator signature qsort() expects; FUNPTR2 is the typed
+ * signature of the COO comparators defined in this file. */
+typedef int (*FUNPTR)(const void *, const void *);
+typedef int (*FUNPTR2)(struct cooFormat *, struct cooFormat *);
+
+/* Comparator table: slot 0 orders entries row-major (CSR), slot 1 orders
+ * them column-major (CSC). */
+static FUNPTR2 fptr_array[2] = { cmp_cooFormat_csr, cmp_cooFormat_csc };
+
+
+/*
+ * Sanity-check a compressed sparse pattern (the pointer array has m+1
+ * entries, the index array nnz entries).  Despite the csr* names, this file
+ * also calls it with the CSC arrays swapped in.
+ *
+ * Checks, each reported on stderr:
+ *   - csrRowPtr[m] - csrRowPtr[0] equals nnz
+ *   - the index base csrRowPtr[0] is 0 or 1
+ *   - for every row: start <= end, and end does not run past nnz
+ *   - every index is >= the base and strictly increasing within its row
+ *
+ * Rows are only inspected while no error has been found.
+ * Returns 0 when the pattern is consistent and 1 otherwise.
+ */
+static int verify_pattern(
+    int m,
+    int nnz,
+    int *csrRowPtr,
+    int *csrColInd)
+{
+    int i, col, start, end, base_index;
+    int error_found = 0;
+
+    if (nnz != (csrRowPtr[m] - csrRowPtr[0])){
+        /* arguments follow the order of the subtraction: last pointer first */
+        fprintf(stderr, "Error (nnz check failed): (csrRowPtr[%d]=%d - csrRowPtr[%d]=%d) != (nnz=%d)\n", m, csrRowPtr[m], 0, csrRowPtr[0], nnz);
+        error_found = 1;
+    }
+
+    base_index = csrRowPtr[0];
+    if ((0 != base_index) && (1 != base_index)){
+        fprintf(stderr, "Error (base index check failed): base index = %d\n", base_index);
+        error_found = 1;
+    }
+
+    for (i=0; (!error_found) && (i<m); i++){
+        start = csrRowPtr[i  ] - base_index;
+        end   = csrRowPtr[i+1] - base_index;
+        if (start > end){
+            fprintf(stderr, "Error (corrupted row): csrRowPtr[%d] (=%d) > csrRowPtr[%d] (=%d)\n", i, start+base_index, i+1, end+base_index);
+            error_found = 1;
+        }
+        else if (end > nnz){
+            /* a later, smaller pointer could still satisfy the nnz check above;
+               stop here rather than read past the nnz entries of csrColInd */
+            fprintf(stderr, "Error (corrupted row): csrRowPtr[%d] (=%d) points past the %d stored indices\n", i+1, end+base_index, nnz);
+            error_found = 1;
+            break;
+        }
+        for (col=start; col<end; col++){
+            if (csrColInd[col] < base_index){
+                fprintf(stderr, "Error (column vs. base index check failed): csrColInd[%d] < %d\n", col, base_index);
+                error_found = 1;
+            }
+            if ((col < (end-1)) && (csrColInd[col] >= csrColInd[col+1])){
+                fprintf(stderr, "Error (sorting of the column indecis check failed): (csrColInd[%d]=%d) >= (csrColInd[%d]=%d)\n", col, csrColInd[col], col+1, csrColInd[col+1]);
+                error_found = 1;
+            }
+        }
+    }
+    return error_found ;
+}
+
+
+/* Allocate room for `count` elements of `size` bytes each.  At least one
+ * byte is requested so that an empty array is not mistaken for an allocation
+ * failure on platforms where malloc(0) returns NULL. */
+static void *mm_alloc_array(int count, size_t size)
+{
+    const size_t bytes = (count > 0) ? ((size_t)count * size) : 0;
+    return malloc(bytes ? bytes : 1);
+}
+
+/* qsort() adapters.  qsort() calls its comparator as
+ * int (*)(const void *, const void *); calling the typed comparators through
+ * a cast function pointer is undefined behaviour, so these wrappers have the
+ * exact expected signature and forward to the comparator table. */
+static int coo_qsort_row_major(const void *lhs, const void *rhs)
+{
+    return fptr_array[0]((struct cooFormat *)lhs, (struct cooFormat *)rhs);
+}
+
+static int coo_qsort_col_major(const void *lhs, const void *rhs)
+{
+    return fptr_array[1]((struct cooFormat *)lhs, (struct cooFormat *)rhs);
+}
+
+/*
+ * Load a Matrix Market coordinate file and convert it to CSR or CSC.
+ *
+ *   filename         path handed to mm_read_mtx_crd()
+ *   elem_type        must be 'z' or 'c' when the file holds complex data
+ *   csrFormat        true: CSR  (*aRowInd = m+1 row pointers,
+ *                                *aColInd = nnz column indices)
+ *                    false: CSC (*aColInd = n+1 column pointers,
+ *                                *aRowInd = nnz row indices)
+ *   m, n, nnz        outputs: matrix dimensions and number of stored entries
+ *                    (nnz includes the mirrored entries when the matrix is
+ *                    extended)
+ *   aVal             output: nnz values converted to T_ELEM with cuGet
+ *   aRowInd/aColInd  outputs, see csrFormat
+ *   extendSymMatrix  non-zero: expand symmetric / hermitian / skew-symmetric
+ *                    files from the stored triangle to the full matrix
+ *
+ * The index base found in the file (0 or 1) is kept as is; indices are not
+ * shifted.  The three output arrays are allocated with malloc() and belong
+ * to the caller.
+ *
+ * Returns 0 on success and 1 on failure.  On failure every buffer acquired
+ * after the file was read is released and *aVal, *aRowInd and *aColInd are
+ * left untouched; *m, *n and *nnz may already have been written.
+ */
+template <typename T_ELEM>
+int loadMMSparseMatrix(
+    char *filename, 
+    char elem_type, 
+    bool csrFormat, 
+    int *m, 
+    int *n, 
+    int *nnz, 
+    T_ELEM **aVal, 
+    int **aRowInd, 
+    int **aColInd, 
+    int extendSymMatrix)
+{
+    MM_typecode matcode;
+    /* buffers returned by the reader */
+    double *tval = NULL;
+    int    *trow = NULL, *tcol = NULL;
+    /* COO working copy (either the reader's buffers or the symmetrized copy) */
+    double *tempVal = NULL;
+    int    *tempRowInd = NULL, *tempColInd = NULL;
+    /* results: compressed pointers, uncompressed indices, converted values */
+    int    *ptr = NULL, *ind = NULL;
+    T_ELEM *val = NULL;
+    struct cooFormat *work = NULL;
+    const int *major, *minor;
+    int    i, j, error, base, base0, base1, count, total, dim, is_real;
+    int    status = 1;
+
+    /* read the matrix */
+    error = mm_read_mtx_crd(filename, m, n, nnz, &trow, &tcol, &tval, &matcode);
+    if (error) {
+        /* the state of the reader's buffers after a failed read is not known
+           here, so nothing is released on this path */
+        fprintf(stderr, "!!!! can not open file: '%s'\n", filename);
+        return 1;
+    }
+
+    /* start error checking */
+    if (mm_is_complex(matcode) && ((elem_type != 'z') && (elem_type != 'c'))) {
+        fprintf(stderr, "!!!! complex matrix requires type 'z' or 'c'\n");
+        goto cleanup;
+    }
+
+    /* integer matrices are accepted and converted like real ones */
+    if (mm_is_dense(matcode) || mm_is_array(matcode) || mm_is_pattern(matcode)){
+        fprintf(stderr, "!!!! dense, array and pattern matrices are not supported\n");
+        goto cleanup;
+    }
+
+    if ((*m < 0) || (*n < 0) || (*nnz < 0)){
+        fprintf(stderr, "!!!! invalid matrix size: m=%d, n=%d, nnz=%d\n", *m, *n, *nnz);
+        goto cleanup;
+    }
+
+    /* real and integer data use one double per entry, anything else two */
+    is_real = (mm_is_real(matcode) || mm_is_integer(matcode)) ? 1 : 0;
+
+    /* if necessary symmetrize the pattern (transform from triangular to full) */
+    if ((extendSymMatrix) && (mm_is_symmetric(matcode) || mm_is_hermitian(matcode) || mm_is_skew(matcode))){
+        /* count number of non-diagonal elements */
+        count = 0;
+        for (i = 0; i < (*nnz); i++){
+            if (trow[i] != tcol[i]){
+                count++;
+            }
+        }
+        total = (*nnz) + count;
+
+        /* allocate space for the symmetrized matrix */
+        tempRowInd = (int *)mm_alloc_array(total, sizeof(int));
+        tempColInd = (int *)mm_alloc_array(total, sizeof(int));
+        tempVal    = (double *)mm_alloc_array(total, (is_real ? 1 : 2) * sizeof(double));
+        if (!tempRowInd || !tempColInd || !tempVal){
+            fprintf(stderr, "!!!! allocation error, malloc failed\n");
+            goto cleanup;
+        }
+
+        /* copy the elements to their regular and transposed locations */
+        for (j = 0, i = 0; i < (*nnz); i++){
+            tempRowInd[j] = trow[i];
+            tempColInd[j] = tcol[i];
+            if (is_real){
+                tempVal[j] = tval[i];
+            }
+            else{
+                tempVal[2*j]   = tval[2*i];
+                tempVal[2*j+1] = tval[2*i+1];
+            }
+            j++;
+            if (trow[i] != tcol[i]){
+                tempRowInd[j] = tcol[i];
+                tempColInd[j] = trow[i];
+                if (is_real){
+                    /* skew-symmetric: A(j,i) = -A(i,j) */
+                    tempVal[j] = mm_is_skew(matcode) ? -tval[i] : tval[i];
+                }
+                else if (mm_is_hermitian(matcode)){
+                    /* hermitian: the mirrored entry is the complex conjugate */
+                    tempVal[2*j]   =  tval[2*i];
+                    tempVal[2*j+1] = -tval[2*i+1];
+                }
+                else if (mm_is_skew(matcode)){
+                    /* skew-symmetric: negate both parts, as in the real case */
+                    tempVal[2*j]   = -tval[2*i];
+                    tempVal[2*j+1] = -tval[2*i+1];
+                }
+                else{
+                    tempVal[2*j]   = tval[2*i];
+                    tempVal[2*j+1] = tval[2*i+1];
+                }
+                j++;
+            }
+        }
+        (*nnz) = total;
+
+        /* the reader's buffers are no longer needed */
+        free(trow);  trow = NULL;
+        free(tcol);  tcol = NULL;
+        free(tval);  tval = NULL;
+    }
+    else{
+        /* hand the reader's buffers over to the temp* pointers */
+        tempRowInd = trow;  trow = NULL;
+        tempColInd = tcol;  tcol = NULL;
+        tempVal    = tval;  tval = NULL;
+    }
+    /* from here on only the COO triple (tempRowInd, tempColInd, tempVal) is used */
+
+    /* use qsort to sort the COO entries; work[].p remembers where each
+       entry's value lives in tempVal */
+    work = (struct cooFormat *)mm_alloc_array(*nnz, sizeof(struct cooFormat));
+    if (NULL == work){
+        fprintf(stderr, "!!!! allocation error, malloc failed\n");
+        goto cleanup;
+    }
+    for (i = 0; i < (*nnz); i++){
+        work[i].i = tempRowInd[i];
+        work[i].j = tempColInd[i];
+        work[i].p = i; /* permutation starts as the identity */
+    }
+
+    /* CSR: sorted by row and within each row by column;
+       CSC: sorted by column and within each column by row */
+    qsort(work, *nnz, sizeof(struct cooFormat),
+          csrFormat ? coo_qsort_row_major : coo_qsort_col_major);
+
+    for (i = 0; i < (*nnz); i++){
+        tempRowInd[i] = work[i].i;
+        tempColInd[i] = work[i].j;
+    }
+
+    /* setup base:
+       any row/col equal to 0 means base-0,
+       any row/col equal to the matrix dimension m/n means base-1 */
+    base0 = 0;
+    base1 = 0;
+    for (i = 0; i < (*nnz); i++){
+        if ((0 == tempRowInd[i]) || (0 == tempColInd[i])){
+            base0 = 1;
+        }
+        if ((*m == tempRowInd[i]) || (*n == tempColInd[i])){
+            base1 = 1;
+        }
+    }
+    if (base0 && base1){
+        fprintf(stderr, "Error: input matrix is base-0 and base-1 \n");
+        goto cleanup;
+    }
+    base = base1 ? 1 : 0;
+
+    /* reject entries outside the matrix: compress_index() uses the indices
+       as array offsets, so an out-of-range value would corrupt memory */
+    for (i = 0; i < (*nnz); i++){
+        const int row = tempRowInd[i];
+        const int col = tempColInd[i];
+        if ((row < base) || ((row - base) >= *m) ||
+            (col < base) || ((col - base) >= *n)){
+            fprintf(stderr, "!!!! entry (%d,%d) is outside the %d x %d matrix (base-%d)\n",
+                    row, col, *m, *n, base);
+            goto cleanup;
+        }
+    }
+
+    /* compress the appropriate indices: rows for CSR, columns for CSC */
+    dim   = csrFormat ? *m : *n;
+    major = csrFormat ? tempRowInd : tempColInd;
+    minor = csrFormat ? tempColInd : tempRowInd;
+
+    ptr = (int *)mm_alloc_array(dim + 1, sizeof(int));
+    ind = (int *)mm_alloc_array(*nnz, sizeof(int));
+    val = (T_ELEM *)mm_alloc_array(*nnz, sizeof(T_ELEM));
+    if (!ptr || !ind || !val){
+        fprintf(stderr, "!!!! allocation error, malloc failed\n");
+        goto cleanup;
+    }
+    compress_index(major, *nnz, dim, ptr, base);
+
+    /* copy the uncompressed indices and transform the values of type double
+       into the requested element type, following the sort permutation */
+    for (i = 0; i < (*nnz); i++){
+        ind[i] = minor[i];
+        if (is_real){
+            val[i] = cuGet<T_ELEM>( tempVal[ work[i].p ] );
+        }
+        else{
+            val[i] = cuGet<T_ELEM>(tempVal[2*work[i].p], tempVal[2*work[i].p+1]);
+        }
+    }
+
+    /* check for corruption */
+    if (verify_pattern(dim, *nnz, ptr, ind)){
+        fprintf(stderr, "!!!! verify_pattern failed\n");
+        goto cleanup;
+    }
+
+    /* success: pass ownership of the result arrays to the caller */
+    *aVal = val;
+    if (csrFormat){
+        *aRowInd = ptr;
+        *aColInd = ind;
+    }
+    else{
+        *aColInd = ptr;
+        *aRowInd = ind;
+    }
+    val = NULL;
+    ptr = NULL;
+    ind = NULL;
+    status = 0;
+
+cleanup:
+    /* free(NULL) is a no-op, so every pointer can be released unconditionally */
+    free(work);
+    free(tempVal);
+    free(tempColInd);
+    free(tempRowInd);
+    free(tval);
+    free(tcol);
+    free(trow);
+    free(val);
+    free(ind);
+    free(ptr);
+
+    return status;
+}   
+
+
+/* Explicit instantiations: the template is defined in this source file, so
+ * the four element types are instantiated here by hand. */
+template int loadMMSparseMatrix<float>(
+    char *filename, char elem_type, bool csrFormat,
+    int *m, int *n, int *nnz,
+    float **aVal, int **aRowInd, int **aColInd,
+    int extendSymMatrix);
+
+template int loadMMSparseMatrix<double>(
+    char *filename, char elem_type, bool csrFormat,
+    int *m, int *n, int *nnz,
+    double **aVal, int **aRowInd, int **aColInd,
+    int extendSymMatrix);
+
+template int loadMMSparseMatrix<cuComplex>(
+    char *filename, char elem_type, bool csrFormat,
+    int *m, int *n, int *nnz,
+    cuComplex **aVal, int **aRowInd, int **aColInd,
+    int extendSymMatrix);
+
+template int loadMMSparseMatrix<cuDoubleComplex>(
+    char *filename, char elem_type, bool csrFormat,
+    int *m, int *n, int *nnz,
+    cuDoubleComplex **aVal, int **aRowInd, int **aColInd,
+    int extendSymMatrix);
+
+
--- a/Show More
+++ b/Show More