add and update samples for CUDA 11.5

2025-07-29 17:23:15 +08:00 · 2021-10-21 16:34:49 +05:30 · 2021-10-21 16:34:49 +05:30 · 1f76a2d110
commit 1f76a2d110
parent 3342d604fe
2796 changed files with 1511725 additions and 827 deletions
--- a/Common/UtilNPP/Exceptions.h
+++ b/Common/UtilNPP/Exceptions.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/Image.h
+++ b/Common/UtilNPP/Image.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/ImageAllocatorsCPU.h
+++ b/Common/UtilNPP/ImageAllocatorsCPU.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/ImageAllocatorsNPP.h
+++ b/Common/UtilNPP/ImageAllocatorsNPP.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/ImageIO.h
+++ b/Common/UtilNPP/ImageIO.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/ImagePacked.h
+++ b/Common/UtilNPP/ImagePacked.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/ImagesCPU.h
+++ b/Common/UtilNPP/ImagesCPU.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/ImagesNPP.h
+++ b/Common/UtilNPP/ImagesNPP.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/Pixel.h
+++ b/Common/UtilNPP/Pixel.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/Signal.h
+++ b/Common/UtilNPP/Signal.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/SignalAllocatorsCPU.h
+++ b/Common/UtilNPP/SignalAllocatorsCPU.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/SignalAllocatorsNPP.h
+++ b/Common/UtilNPP/SignalAllocatorsNPP.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/SignalsCPU.h
+++ b/Common/UtilNPP/SignalsCPU.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/UtilNPP/SignalsNPP.h
+++ b/Common/UtilNPP/SignalsNPP.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/drvapi_error_string.h
+++ b/Common/drvapi_error_string.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/dynlink_d3d10.h
+++ b/Common/dynlink_d3d10.h
@ -0,0 +1,294 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//--------------------------------------------------------------------------------------
+// File: dynlink_d3d10.h
+//
+// Shortcut macros and functions for using DX objects
+//
+// Copyright (c) Microsoft Corporation. All rights reserved
+//--------------------------------------------------------------------------------------
+
+#ifndef _DYNLINK_D3D10_H_
+#define _DYNLINK_D3D10_H_
+
+// Standard Windows includes
+#include <windows.h>
+#include <initguid.h>
+#include <assert.h>
+#include <wchar.h>
+#include <mmsystem.h>
+#include <commctrl.h> // for InitCommonControls() 
+#include <shellapi.h> // for ExtractIcon()
+#include <new.h>      // for placement new
+#include <shlobj.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+
+// CRT's memory leak detection
+#if defined(DEBUG) || defined(_DEBUG)
+#include <crtdbg.h>
+#endif
+
+// Direct3D9 includes
+#include <d3d9.h>
+
+// Direct3D10 includes
+#include <dxgi.h>
+#include <d3d10_1.h>
+#include <d3d10.h>
+
+// XInput includes
+#include <xinput.h>
+
+// strsafe.h deprecates old insecure string functions.  If you
+// really do not want it to (not recommended), then uncomment the next line
+//#define STRSAFE_NO_DEPRECATE
+
+#ifndef STRSAFE_NO_DEPRECATE
+#pragma deprecated("strncpy")
+#pragma deprecated("wcsncpy")
+#pragma deprecated("_tcsncpy")
+#pragma deprecated("wcsncat")
+#pragma deprecated("strncat")
+#pragma deprecated("_tcsncat")
+#endif
+
+#pragma warning( disable : 4996 ) // disable deprecated warning 
+#include <strsafe.h>
+#pragma warning( default : 4996 )
+
+#include <DirectXMath.h>
+
+using namespace DirectX;
+//--------------------------------------------------------------------------------------
+// Structs
+//--------------------------------------------------------------------------------------
+struct DXUTD3D9DeviceSettings // device settings expressed with Direct3D 9 types
+{
+    UINT AdapterOrdinal;
+    D3DDEVTYPE DeviceType;
+    D3DFORMAT AdapterFormat;
+    DWORD BehaviorFlags;
+    D3DPRESENT_PARAMETERS pp;
+};
+
+struct DXUTD3D10DeviceSettings // device settings expressed with Direct3D 10 / DXGI types
+{
+    UINT AdapterOrdinal;
+    D3D10_DRIVER_TYPE DriverType;
+    UINT Output;
+    DXGI_SWAP_CHAIN_DESC sd;
+    UINT32 CreateFlags;
+    UINT32 SyncInterval;
+    DWORD PresentFlags;
+    bool AutoCreateDepthStencil; // DXUT will create a depth stencil resource and view if true
+    DXGI_FORMAT AutoDepthStencilFormat;
+};
+
+enum DXUTDeviceVersion { DXUT_D3D9_DEVICE, DXUT_D3D10_DEVICE }; // tag telling which member of the DXUTDeviceSettings union is valid
+struct DXUTDeviceSettings
+{
+    DXUTDeviceVersion ver;
+    union
+    {
+        DXUTD3D9DeviceSettings d3d9; // only valid if ver == DXUT_D3D9_DEVICE
+        DXUTD3D10DeviceSettings d3d10; // only valid if ver == DXUT_D3D10_DEVICE
+    };
+};
+
+
+//--------------------------------------------------------------------------------------
+// Error codes
+//--------------------------------------------------------------------------------------
+#define DXUTERR_NODIRECT3D              MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0901)
+#define DXUTERR_NOCOMPATIBLEDEVICES     MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0902)
+#define DXUTERR_MEDIANOTFOUND           MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0903)
+#define DXUTERR_NONZEROREFCOUNT         MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0904)
+#define DXUTERR_CREATINGDEVICE          MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0905)
+#define DXUTERR_RESETTINGDEVICE         MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0906)
+#define DXUTERR_CREATINGDEVICEOBJECTS   MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0907)
+#define DXUTERR_RESETTINGDEVICEOBJECTS  MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0908)
+#define DXUTERR_DEVICEREMOVED           MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x090A) // NOTE(review): code 0x0909 is not defined in this list - confirm the gap is intentional
+
+
+typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **); // type of the "CreateDXGIFactory" export resolved from dxgi.dll
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT, UINT32,
+                                             ID3D10Device **);
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE1)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT,
+                                              D3D10_FEATURE_LEVEL1, UINT, ID3D10Device1 **);
+typedef HRESULT(WINAPI *LPD3D10CREATESTATEBLOCK)(ID3D10Device *pDevice, D3D10_STATE_BLOCK_MASK *pStateBlockMask,
+                                                 ID3D10StateBlock **ppStateBlock);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKUNION)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
+                                                    D3D10_STATE_BLOCK_MASK *pResult);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKINTERSECT)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
+                                                        D3D10_STATE_BLOCK_MASK *pResult);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDIFFERENCE)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
+                                                         D3D10_STATE_BLOCK_MASK *pResult);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
+                                                            D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
+                                                            UINT RangeLength);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
+        D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
+        UINT RangeLength);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLEALL)(D3D10_STATE_BLOCK_MASK *pMask);
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLEALL)(D3D10_STATE_BLOCK_MASK *pMask);
+typedef BOOL (WINAPI *LPD3D10STATEBLOCKMASKGETSETTING)(D3D10_STATE_BLOCK_MASK *pMask,
+                                                       D3D10_DEVICE_STATE_TYPES StateType, UINT Entry); // returns BOOL, unlike the HRESULT-returning typedefs around it
+
+typedef HRESULT(WINAPI *LPD3D10COMPILEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, LPCSTR pSrcFileName,
+                                                        CONST D3D10_SHADER_MACRO *pDefines,
+                                                        ID3D10Include *pInclude, UINT HLSLFlags, UINT FXFlags,
+                                                        ID3D10Blob **ppCompiledEffect, ID3D10Blob **ppErrors);
+typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
+                                                       ID3D10Device *pDevice,
+                                                       ID3D10EffectPool *pEffectPool,
+                                                       ID3D10Effect **ppEffect);
+typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTPOOLFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
+                                                           ID3D10Device *pDevice, ID3D10EffectPool **ppEffectPool);
+
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN)(IDXGIAdapter *pAdapter,
+                                                         D3D10_DRIVER_TYPE DriverType,
+                                                         HMODULE Software,
+                                                         UINT Flags,
+                                                         UINT SDKVersion,
+                                                         DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
+                                                         IDXGISwapChain **ppSwapChain,
+                                                         ID3D10Device **ppDevice);
+
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN1)(IDXGIAdapter *pAdapter,
+                                                          D3D10_DRIVER_TYPE DriverType,
+                                                          HMODULE Software,
+                                                          UINT Flags,
+                                                          D3D10_FEATURE_LEVEL1 HardwareLevel,
+                                                          UINT SDKVersion,
+                                                          DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
+                                                          IDXGISwapChain **ppSwapChain,
+                                                          ID3D10Device1 **ppDevice);
+
+// Module handles and entry points filled in by dynlinkLoadD3D10API(); being 'static' in a header, each including translation unit gets its own copy
+static HMODULE                              g_hModDXGI = NULL;
+static HMODULE                              g_hModD3D10 = NULL;
+static HMODULE                              g_hModD3D101 = NULL;
+static LPCREATEDXGIFACTORY                  sFnPtr_CreateDXGIFactory = NULL;
+static LPD3D10CREATESTATEBLOCK              sFnPtr_D3D10CreateStateBlock = NULL;
+static LPD3D10CREATEDEVICE                  sFnPtr_D3D10CreateDevice = NULL;
+static LPD3D10CREATEDEVICE1                 sFnPtr_D3D10CreateDevice1 = NULL;
+static LPD3D10STATEBLOCKMASKUNION           sFnPtr_D3D10StateBlockMaskUnion = NULL;
+static LPD3D10STATEBLOCKMASKINTERSECT       sFnPtr_D3D10StateBlockMaskIntersect = NULL;
+static LPD3D10STATEBLOCKMASKDIFFERENCE      sFnPtr_D3D10StateBlockMaskDifference = NULL;
+static LPD3D10STATEBLOCKMASKENABLECAPTURE   sFnPtr_D3D10StateBlockMaskEnableCapture = NULL;
+static LPD3D10STATEBLOCKMASKDISABLECAPTURE  sFnPtr_D3D10StateBlockMaskDisableCapture = NULL;
+static LPD3D10STATEBLOCKMASKENABLEALL       sFnPtr_D3D10StateBlockMaskEnableAll = NULL;
+static LPD3D10STATEBLOCKMASKDISABLEALL      sFnPtr_D3D10StateBlockMaskDisableAll = NULL;
+static LPD3D10STATEBLOCKMASKGETSETTING      sFnPtr_D3D10StateBlockMaskGetSetting = NULL;
+static LPD3D10COMPILEEFFECTFROMMEMORY       sFnPtr_D3D10CompileEffectFromMemory = NULL;
+static LPD3D10CREATEEFFECTFROMMEMORY        sFnPtr_D3D10CreateEffectFromMemory = NULL;
+static LPD3D10CREATEEFFECTPOOLFROMMEMORY    sFnPtr_D3D10CreateEffectPoolFromMemory = NULL;
+static LPD3D10CREATEDEVICEANDSWAPCHAIN      sFnPtr_D3D10CreateDeviceAndSwapChain  = NULL;
+static LPD3D10CREATEDEVICEANDSWAPCHAIN1     sFnPtr_D3D10CreateDeviceAndSwapChain1 = NULL;
+
+// Unload the D3D10 DLLs.
+// Releases every module handle that is currently held and clears all cached
+// entry points, so that no pointer into a released module is left behind.
+// Safe to call when nothing (or only some of the DLLs) is loaded.
+// Always returns true.
+static bool dynlinkUnloadD3D10API(void)
+{
+    if (g_hModD3D10)
+    {
+        FreeLibrary(g_hModD3D10);
+        g_hModD3D10 = NULL;
+    }
+
+    if (g_hModDXGI)
+    {
+        FreeLibrary(g_hModDXGI);
+        g_hModDXGI = NULL;
+    }
+
+    if (g_hModD3D101)
+    {
+        FreeLibrary(g_hModD3D101);
+        g_hModD3D101 = NULL;
+    }
+
+    // The addresses below were obtained from the modules released above;
+    // reset them so a later "is it available?" check sees NULL.
+    sFnPtr_CreateDXGIFactory                 = NULL;
+    sFnPtr_D3D10CreateStateBlock             = NULL;
+    sFnPtr_D3D10CreateDevice                 = NULL;
+    sFnPtr_D3D10CreateDevice1                = NULL;
+    sFnPtr_D3D10StateBlockMaskUnion          = NULL;
+    sFnPtr_D3D10StateBlockMaskIntersect      = NULL;
+    sFnPtr_D3D10StateBlockMaskDifference     = NULL;
+    sFnPtr_D3D10StateBlockMaskEnableCapture  = NULL;
+    sFnPtr_D3D10StateBlockMaskDisableCapture = NULL;
+    sFnPtr_D3D10StateBlockMaskEnableAll      = NULL;
+    sFnPtr_D3D10StateBlockMaskDisableAll     = NULL;
+    sFnPtr_D3D10StateBlockMaskGetSetting     = NULL;
+    sFnPtr_D3D10CompileEffectFromMemory      = NULL;
+    sFnPtr_D3D10CreateEffectFromMemory       = NULL;
+    sFnPtr_D3D10CreateEffectPoolFromMemory   = NULL;
+    sFnPtr_D3D10CreateDeviceAndSwapChain     = NULL;
+    sFnPtr_D3D10CreateDeviceAndSwapChain1    = NULL;
+
+    return true;
+}
+
+// Dynamically load the D3D10 DLLs and map the function pointers.
+// Loads d3d10.dll, dxgi.dll and d3d10_1.dll and stores the addresses of the
+// exports used by the samples in the sFnPtr_* variables.
+// Returns true only when all three DLLs could be loaded; otherwise whatever
+// was loaded is released again via dynlinkUnloadD3D10API() and false is returned.
+// The individual GetProcAddress() results are not validated here, so callers
+// should NULL-check a pointer before calling through it.
+static bool dynlinkLoadD3D10API(void)
+{
+    // Already loaded: do not take extra module references that a single
+    // dynlinkUnloadD3D10API() call would not give back.
+    if (g_hModD3D10 != NULL && g_hModDXGI != NULL && g_hModD3D101 != NULL)
+    {
+        return true;
+    }
+
+    // First check to see if the D3D10 Library is present.
+    // if it succeeds, then we can call GetProcAddress to grab all of the DX10 functions
+    // LoadLibraryA is named explicitly: the names are narrow string literals, which
+    // the LoadLibrary macro rejects at compile time when UNICODE is defined.
+    g_hModD3D10 = LoadLibraryA("d3d10.dll");
+
+    if (g_hModD3D10 != NULL)
+    {
+        sFnPtr_D3D10CreateStateBlock             = (LPD3D10CREATESTATEBLOCK)           GetProcAddress(g_hModD3D10, "D3D10CreateStateBlock");
+        sFnPtr_D3D10CreateDevice                 = (LPD3D10CREATEDEVICE)           GetProcAddress(g_hModD3D10, "D3D10CreateDevice");
+
+        sFnPtr_D3D10StateBlockMaskUnion          = (LPD3D10STATEBLOCKMASKUNION)        GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskUnion");
+        sFnPtr_D3D10StateBlockMaskIntersect      = (LPD3D10STATEBLOCKMASKINTERSECT)    GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskIntersect");
+        sFnPtr_D3D10StateBlockMaskDifference     = (LPD3D10STATEBLOCKMASKDIFFERENCE)   GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDifference");
+        sFnPtr_D3D10StateBlockMaskEnableCapture  = (LPD3D10STATEBLOCKMASKENABLECAPTURE) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableCapture");
+        sFnPtr_D3D10StateBlockMaskDisableCapture = (LPD3D10STATEBLOCKMASKDISABLECAPTURE)GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableCapture");
+
+        sFnPtr_D3D10StateBlockMaskEnableAll      = (LPD3D10STATEBLOCKMASKENABLEALL)    GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableAll");
+        sFnPtr_D3D10StateBlockMaskDisableAll     = (LPD3D10STATEBLOCKMASKDISABLEALL)   GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableAll");
+        sFnPtr_D3D10StateBlockMaskGetSetting     = (LPD3D10STATEBLOCKMASKGETSETTING)   GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskGetSetting");
+
+        sFnPtr_D3D10CompileEffectFromMemory      = (LPD3D10COMPILEEFFECTFROMMEMORY)    GetProcAddress(g_hModD3D10, "D3D10CompileEffectFromMemory");
+        sFnPtr_D3D10CreateEffectFromMemory       = (LPD3D10CREATEEFFECTFROMMEMORY)     GetProcAddress(g_hModD3D10, "D3D10CreateEffectFromMemory");
+        sFnPtr_D3D10CreateEffectPoolFromMemory   = (LPD3D10CREATEEFFECTPOOLFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CreateEffectPoolFromMemory");
+
+        sFnPtr_D3D10CreateDeviceAndSwapChain     = (LPD3D10CREATEDEVICEANDSWAPCHAIN)    GetProcAddress(g_hModD3D10, "D3D10CreateDeviceAndSwapChain");
+    }
+
+    g_hModDXGI = LoadLibraryA("dxgi.dll");
+
+    if (g_hModDXGI)
+    {
+        sFnPtr_CreateDXGIFactory                 = (LPCREATEDXGIFACTORY)           GetProcAddress(g_hModDXGI , "CreateDXGIFactory");
+    }
+
+    // This may fail if this machine isn't Windows Vista SP1 or later
+    // (such a failure makes the whole load fail, see the check below)
+    g_hModD3D101 = LoadLibraryA("d3d10_1.dll");
+
+    if (g_hModD3D101 != NULL)
+    {
+        sFnPtr_D3D10CreateDevice1                = (LPD3D10CREATEDEVICE1)              GetProcAddress(g_hModD3D101, "D3D10CreateDevice1");
+        sFnPtr_D3D10CreateDeviceAndSwapChain1    = (LPD3D10CREATEDEVICEANDSWAPCHAIN1)   GetProcAddress(g_hModD3D101, "D3D10CreateDeviceAndSwapChain1");
+    }
+
+    // All three DLLs are required; release any partial set before reporting failure
+    if (g_hModD3D10 == NULL || g_hModDXGI == NULL || g_hModD3D101 == NULL)
+    {
+        dynlinkUnloadD3D10API();
+        return false;
+    }
+
+    return true;
+}
+
+#endif
--- a/Common/dynlink_d3d11.h
+++ b/Common/dynlink_d3d11.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/exception.h
+++ b/Common/exception.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_cuda.h
+++ b/Common/helper_cuda.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_cuda_drvapi.h
+++ b/Common/helper_cuda_drvapi.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_cusolver.h
+++ b/Common/helper_cusolver.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_functions.h
+++ b/Common/helper_functions.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_gl.h
+++ b/Common/helper_gl.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_image.h
+++ b/Common/helper_image.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_math.h
+++ b/Common/helper_math.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_multiprocess.cpp
+++ b/Common/helper_multiprocess.cpp
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_multiprocess.h
+++ b/Common/helper_multiprocess.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_nvJPEG.hxx
+++ b/Common/helper_nvJPEG.hxx
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_string.h
+++ b/Common/helper_string.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/helper_timer.h
+++ b/Common/helper_timer.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/multithreading.cpp
+++ b/Common/multithreading.cpp
@ -0,0 +1,78 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <multithreading.h>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// Create thread: starts func(data) on a new Win32 thread and returns CreateThread's result unchanged
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
+  return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);  // NULL/0 arguments: no attributes, stack size or flags given, thread id not requested
+}
+
+// Wait for thread to finish, then release its handle (the handle is unusable afterwards)
+void cutEndThread(CUTThread thread) {
+  WaitForSingleObject(thread, INFINITE);  // no timeout: blocks until the handle is signaled
+  CloseHandle(thread);
+}
+
+// Destroy thread: terminates it without waiting for it to finish, then releases its handle
+void cutDestroyThread(CUTThread thread) {
+  TerminateThread(thread, 0);  // 0 is passed as the thread's exit code
+  CloseHandle(thread);
+}
+
+// Wait for multiple threads, then close their handles.
+// 'threads' points to 'num' thread handles; all of them are invalid afterwards.
+// WaitForMultipleObjects() accepts at most MAXIMUM_WAIT_OBJECTS handles per
+// call, so the array is waited on in slices of that size. A single oversized
+// call would fail immediately and the handles would then be closed while the
+// threads were still running.
+void cutWaitForThreads(const CUTThread *threads, int num) {
+  for (int first = 0; first < num; first += MAXIMUM_WAIT_OBJECTS) {
+    int count = num - first;
+
+    if (count > MAXIMUM_WAIT_OBJECTS) {
+      count = MAXIMUM_WAIT_OBJECTS;
+    }
+
+    // TRUE: return only once every handle of this slice is signaled
+    WaitForMultipleObjects((DWORD)count, threads + first, TRUE, INFINITE);
+  }
+
+  for (int i = 0; i < num; i++) {
+    CloseHandle(threads[i]);
+  }
+}
+
+#else
+// Create thread: starts func(data) on a new POSIX thread and returns its pthread_t
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
+  pthread_t thread;
+  pthread_create(&thread, NULL, func, data);  // NOTE(review): the result is ignored, so on failure 'thread' is returned uninitialized
+  return thread;
+}
+
+// Wait for thread to finish (joins it; the thread's return value is discarded)
+void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }
+
+// Destroy thread: request cancellation, then detach the thread so that its
+// resources are reclaimed once it terminates. Without the detach a cancelled
+// (still joinable) thread is never cleaned up, because nobody joins it.
+// The thread id must not be used again afterwards, which matches the Windows
+// variant that closes the thread handle.
+void cutDestroyThread(CUTThread thread) {
+  pthread_cancel(thread);
+  pthread_detach(thread);
+}
+
+// Wait for multiple threads: joins the 'num' threads in 'threads' one after another, in array order
+void cutWaitForThreads(const CUTThread *threads, int num) {
+  for (int i = 0; i < num; i++) {
+    cutEndThread(threads[i]);
+  }
+}
+
+#endif
--- a/Common/multithreading.h
+++ b/Common/multithreading.h
@ -0,0 +1,76 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MULTITHREADING_H
+#define MULTITHREADING_H
+
+
+//Simple portable thread library.
+
+//Windows threads.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#include <windows.h>
+
+typedef HANDLE CUTThread; //a thread is identified by its Win32 handle
+typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *); //thread entry point type
+
+#define CUT_THREADPROC unsigned WINAPI //return type and calling convention for declaring a thread entry point
+#define  CUT_THREADEND return 0 //statement that ends a thread entry point
+
+#else
+//POSIX threads.
+#include <pthread.h>
+
+typedef pthread_t CUTThread; //a thread is identified by its pthread_t
+typedef void *(*CUT_THREADROUTINE)(void *); //thread entry point type
+
+#define CUT_THREADPROC void //NOTE(review): plain void, while CUT_THREADROUTINE returns void * - a function declared with this macro does not match that type without a cast; confirm intended
+#define  CUT_THREADEND //expands to nothing on POSIX
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Create thread: runs the given routine with 'data' as its argument and returns the thread identifier.
+CUTThread cutStartThread(CUT_THREADROUTINE, void *data);
+
+//Wait for thread to finish.
+void cutEndThread(CUTThread thread);
+
+//Destroy thread without waiting for it to finish.
+void cutDestroyThread(CUTThread thread);
+
+//Wait for multiple threads: 'threads' points to 'num' thread identifiers.
+void cutWaitForThreads(const CUTThread *threads, int num);
+
+#ifdef __cplusplus
+} //extern "C"
+#endif
+
+#endif //MULTITHREADING_H
--- a/Common/nvMath.h
+++ b/Common/nvMath.h
@ -0,0 +1,111 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// Template math library for common 3D functionality
+//
+// This code is in part derived from glh, a cross platform glut helper library.
+// The copyright for glh follows this notice.
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+    Copyright (c) 2000 Cass Everitt
+    Copyright (c) 2000 NVIDIA Corporation
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or
+    without modification, are permitted provided that the following
+    conditions are met:
+
+     * Redistributions of source code must retain the above
+       copyright notice, this list of conditions and the following
+       disclaimer.
+
+     * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials
+       provided with the distribution.
+
+     * The names of contributors to this software may not be used
+       to endorse or promote products derived from this software
+       without specific prior written permission.
+
+       THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+       ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+       LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+       FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+       REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+       INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+       BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+       LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+       CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+       LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+       ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+       POSSIBILITY OF SUCH DAMAGE.
+
+
+    Cass Everitt - cass@r3.nu
+*/
+
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include <math.h>
+
+#include <nvVector.h>
+#include <nvMatrix.h>
+#include <nvQuaternion.h>
+
+#define NV_PI   float(3.1415926535897932384626433832795)
+
+namespace nv
+{
+
+    // Single-precision (and integer) aliases for the templates declared in
+    // nvVector.h, nvMatrix.h and nvQuaternion.h.
+    typedef vec2<float> vec2f;
+    typedef vec3<float> vec3f;
+    typedef vec3<int> vec3i;
+    typedef vec3<unsigned int> vec3ui;
+    typedef vec4<float> vec4f;
+    typedef matrix4<float> matrix4f;
+    typedef quaternion<float> quaternionf;
+
+
+    // Multiply the current OpenGL matrix by the rotation stored in r.
+    //
+    // r is decomposed into an axis and an angle; the angle comes back in
+    // radians and is converted to the degrees glRotatef() expects.
+    //
+    // NOTE(review): glRotatef() is not declared by any header included in this
+    // file, so the including translation unit presumably has to pull in an
+    // OpenGL header first -- confirm.
+    inline void applyRotation(const quaternionf &r)
+    {
+        float angle;
+        vec3f axis;
+        r.get_value(axis, angle);
+        // use the shared NV_PI constant rather than a second, truncated literal
+        glRotatef(angle / NV_PI * 180.0f, axis[0], axis[1], axis[2]);
+    }
+
+}
+
+#endif
--- a/Common/nvMatrix.h
+++ b/Common/nvMatrix.h
@ -0,0 +1,540 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// Template math library for common 3D functionality
+//
+// nvMatrix.h - template matrix code
+//
+// This code is in part derived from glh, a cross platform glut helper library.
+// The copyright for glh follows this notice.
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+    Copyright (c) 2000 Cass Everitt
+    Copyright (c) 2000 NVIDIA Corporation
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or
+    without modification, are permitted provided that the following
+    conditions are met:
+
+     * Redistributions of source code must retain the above
+       copyright notice, this list of conditions and the following
+       disclaimer.
+
+     * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials
+       provided with the distribution.
+
+     * The names of contributors to this software may not be used
+       to endorse or promote products derived from this software
+       without specific prior written permission.
+
+       THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+       ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+       LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+       FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+       REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+       INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+       BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+       LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+       CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+       LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+       ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+       POSSIBILITY OF SUCH DAMAGE.
+
+
+    Cass Everitt - cass@r3.nu
+*/
+
+#ifndef NV_MATRIX_H
+#define NV_MATRIX_H
+
+namespace nv
+{
+
+    template <class T> class vec2;
+    template <class T> class vec3;
+    template <class T> class vec4;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //
+    //  Matrix
+    //
+    ////////////////////////////////////////////////////////////////////////////////
+    // 4x4 matrix of T.
+    //
+    // Elements live in a flat 16-entry array addressed as
+    // _array[row | (col << 2)], so the four rows of one column are adjacent
+    // in memory.  The named members _11 .. _44 alias _array[0] .. _array[15]
+    // in declaration order.
+    template<class T>
+    class matrix4
+    {
+
+        public:
+
+            // Identity matrix.
+            matrix4()
+            {
+                make_identity();
+            }
+
+            // Matrix with every element equal to t.
+            matrix4(T t)
+            {
+                set_value(t);
+            }
+
+            // Matrix loaded from 16 values laid out like _array.
+            matrix4(const T *m)
+            {
+                set_value(m);
+            }
+
+            // Element-wise constructor.  The arguments are stored into
+            // _11 .. _44 (i.e. _array[0] .. _array[15]) in the order given, so
+            // with element()'s indexing a00..a03 fill column 0, a10..a13
+            // column 1, and so on.
+            matrix4(T a00, T a01, T a02, T a03,
+                    T a10, T a11, T a12, T a13,
+                    T a20, T a21, T a22, T a23,
+                    T a30, T a31, T a32, T a33) :
+                _11(a00), _12(a01), _13(a02), _14(a03),
+                _21(a10), _22(a11), _23(a12), _24(a13),
+                _31(a20), _32(a21), _33(a22), _34(a23),
+                _41(a30), _42(a31), _43(a32), _44(a33)
+            {}
+
+
+            // Copy all 16 elements to mp in storage (_array) order.
+            void get_value(T *mp) const
+            {
+                for (int c = 0; c < 16; c++)
+                {
+                    mp[c] = _array[c];
+                }
+            }
+
+            // Read-only pointer to the 16 stored elements.
+            const T *get_value() const
+            {
+                return _array;
+            }
+
+            // Load all 16 elements from mp, which is laid out like _array.
+            // Takes a pointer to const so that the matrix4(const T *)
+            // constructor can forward to it; callers passing T * still work.
+            void set_value(const T *mp)
+            {
+                for (int c = 0; c < 16; c++)
+                {
+                    _array[c] = mp[c];
+                }
+            }
+
+            // Set every element to r.
+            void set_value(T r)
+            {
+                for (int c = 0; c < 16; c++)
+                {
+                    _array[c] = r;
+                }
+            }
+
+            // Reset to the identity matrix.
+            void make_identity()
+            {
+                for (int i = 0; i < 4; i++)
+                    for (int j = 0; j < 4; j++)
+                    {
+                        element(i,j) = (i == j) ? T(1.0) : T(0.0);
+                    }
+            }
+
+            // set a uniform scale
+            // (only the first three diagonal elements are written)
+            void set_scale(T s)
+            {
+                element(0,0) = s;
+                element(1,1) = s;
+                element(2,2) = s;
+            }
+
+            // Set a per-axis scale; only the first three diagonal elements
+            // are written.
+            void set_scale(const vec3<T> &s)
+            {
+                for (int i = 0; i < 3; i++)
+                {
+                    element(i,i) = s[i];
+                }
+            }
+
+
+            // Write t into rows 0..2 of column 3; nothing else is touched.
+            void set_translate(const vec3<T> &t)
+            {
+                for (int i = 0; i < 3; i++)
+                {
+                    element(i,3) = t[i];
+                }
+            }
+
+            // Overwrite row r with t.
+            void set_row(int r, const vec4<T> &t)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    element(r,i) = t[i];
+                }
+            }
+
+            // Overwrite column c with t.
+            void set_column(int c, const vec4<T> &t)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    element(i,c) = t[i];
+                }
+            }
+
+            // Copy of row r.
+            vec4<T> get_row(int r) const
+            {
+                vec4<T> v;
+
+                for (int i = 0; i < 4; i++)
+                {
+                    v[i] = element(r,i);
+                }
+
+                return v;
+            }
+
+            // Copy of column c.
+            vec4<T> get_column(int c) const
+            {
+                vec4<T> v;
+
+                for (int i = 0; i < 4; i++)
+                {
+                    v[i] = element(i,c);
+                }
+
+                return v;
+            }
+
+            // Inverse of m, computed by Gaussian elimination with scaled
+            // partial pivoting on the augmented matrix [ m | I ] followed by
+            // back substitution.  A singular m yields the identity matrix
+            // (the default-constructed result).
+            friend matrix4 inverse(const matrix4 &m)
+            {
+                matrix4 minv;
+
+                T r1[8], r2[8], r3[8], r4[8];
+                T *s[4], *tmprow;
+
+                s[0] = &r1[0];
+                s[1] = &r2[0];
+                s[2] = &r3[0];
+                s[3] = &r4[0];
+
+                // plain ints: the `register` specifier was removed in C++17
+                int i,j,p,jj;
+
+                // build the augmented rows [ m | I ]
+                for (i=0; i<4; i++)
+                {
+                    for (j=0; j<4; j++)
+                    {
+                        s[i][j] = m.element(i,j);
+                        s[i][j+4] = (i==j) ? T(1.0) : T(0.0);
+                    }
+                }
+
+                // per-row scale factor: largest magnitude in the row
+                T scp[4];
+
+                for (i=0; i<4; i++)
+                {
+                    scp[i] = T(fabs(s[i][0]));
+
+                    for (j=1; j<4; j++)
+                        if (T(fabs(s[i][j])) > scp[i])
+                        {
+                            scp[i] = T(fabs(s[i][j]));
+                        }
+
+                    if (scp[i] == 0.0)
+                    {
+                        return minv;    // singular matrix!
+                    }
+                }
+
+                int pivot_to;
+                T scp_max;
+
+                for (i=0; i<4; i++)
+                {
+                    // select pivot row
+                    pivot_to = i;
+                    scp_max = T(fabs(s[i][i]/scp[i]));
+
+                    // find out which row should be on top
+                    for (p=i+1; p<4; p++)
+                        if (T(fabs(s[p][i]/scp[p])) > scp_max)
+                        {
+                            scp_max = T(fabs(s[p][i]/scp[p]));
+                            pivot_to = p;
+                        }
+
+                    // Pivot if necessary
+                    if (pivot_to != i)
+                    {
+                        tmprow = s[i];
+                        s[i] = s[pivot_to];
+                        s[pivot_to] = tmprow;
+                        T tmpscp;
+                        tmpscp = scp[i];
+                        scp[i] = scp[pivot_to];
+                        scp[pivot_to] = tmpscp;
+                    }
+
+                    // A zero pivot after row selection means the whole
+                    // remaining column is zero: bail out before dividing by
+                    // it.  This covers all four pivots, including s[3][3].
+                    if (s[i][i] == 0.0)
+                    {
+                        return minv;    // singular matrix!
+                    }
+
+                    T mji;
+
+                    // perform gaussian elimination
+                    for (j=i+1; j<4; j++)
+                    {
+                        mji = s[j][i]/s[i][i];
+                        s[j][i] = 0.0;
+
+                        for (jj=i+1; jj<8; jj++)
+                        {
+                            s[j][jj] -= mji*s[i][jj];
+                        }
+                    }
+                }
+
+                //
+                // Now we have an upper triangular matrix.
+                //
+                //  x x x x | y y y y
+                //  0 x x x | y y y y
+                //  0 0 x x | y y y y
+                //  0 0 0 x | y y y y
+                //
+                //  we'll back substitute to get the inverse
+                //
+                //  1 0 0 0 | z z z z
+                //  0 1 0 0 | z z z z
+                //  0 0 1 0 | z z z z
+                //  0 0 0 1 | z z z z
+                //
+
+                T mij;
+
+                for (i=3; i>0; i--)
+                {
+                    for (j=i-1; j > -1; j--)
+                    {
+                        mij = s[j][i]/s[i][i];
+
+                        for (jj=j+1; jj<8; jj++)
+                        {
+                            s[j][jj] -= mij*s[i][jj];
+                        }
+                    }
+                }
+
+                // the left half is now diagonal; divide it out of the right half
+                for (i=0; i<4; i++)
+                    for (j=0; j<4; j++)
+                    {
+                        minv(i,j) = s[i][j+4] / s[i][i];
+                    }
+
+                return minv;
+            }
+
+
+            // Transposed copy of m.
+            friend matrix4 transpose(const matrix4 &m)
+            {
+                matrix4 mtrans;
+
+                for (int i=0; i<4; i++)
+                    for (int j=0; j<4; j++)
+                    {
+                        mtrans(i,j) = m.element(j,i);
+                    }
+
+                return mtrans;
+            }
+
+            // *this = *this * rhs
+            matrix4 &operator *= (const matrix4 &rhs)
+            {
+                // work from a copy so that rhs may alias *this
+                matrix4 mt(*this);
+                set_value(T(0));
+
+                for (int i=0; i < 4; i++)
+                    for (int j=0; j < 4; j++)
+                        for (int c=0; c < 4; c++)
+                        {
+                            element(i,j) += mt(i,c) * rhs(c,j);
+                        }
+
+                return *this;
+            }
+
+            // Matrix product lhs * rhs.
+            friend matrix4 operator * (const matrix4 &lhs, const matrix4 &rhs)
+            {
+                matrix4 r(T(0));
+
+                for (int i=0; i < 4; i++)
+                    for (int j=0; j < 4; j++)
+                        for (int c=0; c < 4; c++)
+                        {
+                            r.element(i,j) += lhs(i,c) * rhs(c,j);
+                        }
+
+                return r;
+            }
+
+            // dst = M * src
+            vec4<T> operator *(const vec4<T> &src) const
+            {
+                vec4<T> r;
+
+                for (int i = 0; i < 4; i++)
+                    r[i]  = (src[0] * element(i,0) + src[1] * element(i,1) +
+                             src[2] * element(i,2) + src[3] * element(i,3));
+
+                return r;
+            }
+
+            // dst = src * M
+            friend vec4<T> operator *(const vec4<T> &lhs, const matrix4 &rhs)
+            {
+                vec4<T> r;
+
+                for (int i = 0; i < 4; i++)
+                    r[i]  = (lhs[0] * rhs.element(0,i) + lhs[1] * rhs.element(1,i) +
+                             lhs[2] * rhs.element(2,i) + lhs[3] * rhs.element(3,i));
+
+                return r;
+            }
+
+            // Element access by (row, col); same as element().
+            T &operator()(int row, int col)
+            {
+                return element(row,col);
+            }
+
+            const T &operator()(int row, int col) const
+            {
+                return element(row,col);
+            }
+
+            // Element access by (row, col).  No bounds checking: both
+            // indices must be in 0..3.
+            T &element(int row, int col)
+            {
+                return _array[row | (col<<2)];
+            }
+
+            const T &element(int row, int col) const
+            {
+                return _array[row | (col<<2)];
+            }
+
+            // Multiply every element by the scalar r.
+            matrix4 &operator *= (const T &r)
+            {
+                for (int i = 0; i < 4; ++i)
+                {
+                    element(0,i) *= r;
+                    element(1,i) *= r;
+                    element(2,i) *= r;
+                    element(3,i) *= r;
+                }
+
+                return *this;
+            }
+
+            // Element-wise addition of mat.
+            matrix4 &operator += (const matrix4 &mat)
+            {
+                for (int i = 0; i < 4; ++i)
+                {
+                    element(0,i) += mat.element(0,i);
+                    element(1,i) += mat.element(1,i);
+                    element(2,i) += mat.element(2,i);
+                    element(3,i) += mat.element(3,i);
+                }
+
+                return *this;
+            }
+
+
+            // True when all 16 elements compare equal.
+            friend bool operator == (const matrix4 &lhs, const matrix4 &rhs)
+            {
+                for (int i = 0; i < 16; i++)
+                {
+                    if (!(lhs._array[i] == rhs._array[i]))
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
+            }
+
+            // True when at least one element differs, i.e. the exact negation
+            // of operator ==.  (Requiring *every* element to differ would
+            // report most unequal matrices as "not unequal".)
+            friend bool operator != (const matrix4 &lhs, const matrix4 &rhs)
+            {
+                return !(lhs == rhs);
+            }
+
+            union
+            {
+                struct
+                {
+                    T _11, _12, _13, _14;   // standard names for components
+                    T _21, _22, _23, _24;   // standard names for components
+                    T _31, _32, _33, _34;   // standard names for components
+                    T _41, _42, _43, _44;   // standard names for components
+                };
+                T _array[16];     // array access
+            };
+    };
+
+};
+
+#endif
--- a/Common/nvQuaternion.h
+++ b/Common/nvQuaternion.h
@ -0,0 +1,530 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// Template math library for common 3D functionality
+//
+// nvQuaternion.h - quaternion template and utility functions
+//
+// This code is in part derived from glh, a cross platform glut helper library.
+// The copyright for glh follows this notice.
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+/*
+    Copyright (c) 2000 Cass Everitt
+    Copyright (c) 2000 NVIDIA Corporation
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or
+    without modification, are permitted provided that the following
+    conditions are met:
+
+     * Redistributions of source code must retain the above
+       copyright notice, this list of conditions and the following
+       disclaimer.
+
+     * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials
+       provided with the distribution.
+
+     * The names of contributors to this software may not be used
+       to endorse or promote products derived from this software
+       without specific prior written permission.
+
+       THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+       ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+       LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+       FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+       REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+       INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+       BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+       LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+       CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+       LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+       ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+       POSSIBILITY OF SUCH DAMAGE.
+
+
+    Cass Everitt - cass@r3.nu
+*/
+#ifndef NV_QUATERNION_H
+#define NV_QUATERNION_H
+
+namespace nv
+{
+
+    template <class T> class vec2;
+    template <class T> class vec3;
+    template <class T> class vec4;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    //
+    //  Quaternion
+    //
+    ////////////////////////////////////////////////////////////////////////////////
+
+    template< class T>
+    class quaternion
+    {
+        public:
+
+            // Default: all four components zero.  Note that this is the zero
+            // quaternion, not the identity rotation (which would have w == 1).
+            quaternion() : x(0.0), y(0.0), z(0.0), w(0.0)
+            {
+            }
+
+            // From an array of four components, copied in order.
+            quaternion(const T v[4])
+            {
+                set_value(v);
+            }
+
+
+            // From four individual components, stored in the order given.
+            quaternion(T q0, T q1, T q2, T q3)
+            {
+                set_value(q0, q1, q2, q3);
+            }
+
+
+            // From a matrix; see set_value(const matrix4<T> &).
+            quaternion(const matrix4<T> &m)
+            {
+                set_value(m);
+            }
+
+
+            // From a rotation axis and an angle in radians.
+            quaternion(const vec3<T> &axis, T radians)
+            {
+                set_value(axis, radians);
+            }
+
+
+            // Rotation taking the direction rotateFrom onto rotateTo.
+            quaternion(const vec3<T> &rotateFrom, const vec3<T> &rotateTo)
+            {
+                set_value(rotateFrom, rotateTo);
+            }
+
+            // Rotation taking one look/up frame onto another; see the
+            // four-vector set_value() overload.
+            quaternion(const vec3<T> &from_look, const vec3<T> &from_up,
+                       const vec3<T> &to_look, const vec3<T> &to_up)
+            {
+                set_value(from_look, from_up, to_look, to_up);
+            }
+
+            // Read-only pointer to the first of the four stored components.
+            const T *get_value() const
+            {
+                const T *first = &_array[0];
+                return first;
+            }
+
+            // Copy the four stored components, in order, into q0 .. q3.
+            void get_value(T &q0, T &q1, T &q2, T &q3) const
+            {
+                const T *src = &_array[0];
+
+                q0 = src[0];
+                q1 = src[1];
+                q2 = src[2];
+                q3 = src[3];
+            }
+
+            // Store q0 .. q3, in order, as the four components.
+            // Returns *this so calls can be chained.
+            quaternion &set_value(T q0, T q1, T q2, T q3)
+            {
+                T *dst = &_array[0];
+
+                dst[0] = q0;
+                dst[1] = q1;
+                dst[2] = q2;
+                dst[3] = q3;
+
+                return *this;
+            }
+
+            // Decompose into a rotation axis and an angle in radians.
+            //
+            // The angle is 2 * acos(fourth component).  When it is exactly
+            // zero the axis is arbitrary and (0, 0, 1) is reported; otherwise
+            // the axis is the normalized first three components.
+            void get_value(vec3<T> &axis, T &radians) const
+            {
+                // Clamp into acos()'s domain: rounding can push the fourth
+                // component of a nominally unit quaternion slightly past
+                // +/-1, which would otherwise produce NaN.
+                T cos_half = _array[3];
+
+                if (cos_half > T(1.0))
+                {
+                    cos_half = T(1.0);
+                }
+                else if (cos_half < T(-1.0))
+                {
+                    cos_half = T(-1.0);
+                }
+
+                radians = T(acos(cos_half) * T(2.0));
+
+                if (radians == T(0.0))
+                {
+                    axis = vec3<T>(0.0, 0.0, 1.0);
+                }
+                else
+                {
+                    axis[0] = _array[0];
+                    axis[1] = _array[1];
+                    axis[2] = _array[2];
+                    axis = normalize(axis);
+                }
+            }
+
+            // Write the rotation matrix for this quaternion into m.
+            //
+            // The components are scaled by 2 / (squared norm), so the
+            // quaternion need not be unit length; the all-zero quaternion
+            // (norm == 0) produces the identity matrix.  All 16 elements of m
+            // are overwritten.
+            void get_value(matrix4<T> &m) const
+            {
+                T s, xs, ys, zs, wx, wy, wz, xx, xy, xz, yy, yz, zz;
+
+                // squared length of the quaternion
+                T norm = _array[0] * _array[0] + _array[1] * _array[1] + _array[2] * _array[2] + _array[3] * _array[3];
+
+                // guard the division: s == 0 makes every product below zero
+                s = (norm == T(0.0)) ? T(0.0) : (T(2.0) / norm);
+
+                xs = _array[0] * s;
+                ys = _array[1] * s;
+                zs = _array[2] * s;
+
+                wx = _array[3] * xs;
+                wy = _array[3] * ys;
+                wz = _array[3] * zs;
+
+                xx = _array[0] * xs;
+                xy = _array[0] * ys;
+                xz = _array[0] * zs;
+
+                yy = _array[1] * ys;
+                yz = _array[1] * zs;
+                zz = _array[2] * zs;
+
+                // upper-left 3x3 rotation part, indexed m(row, col)
+                m(0,0) = T(T(1.0) - (yy + zz));
+                m(1,0) = T(xy + wz);
+                m(2,0) = T(xz - wy);
+
+                m(0,1) = T(xy - wz);
+                m(1,1) = T(T(1.0) - (xx + zz));
+                m(2,1) = T(yz + wx);
+
+                m(0,2) = T(xz + wy);
+                m(1,2) = T(yz - wx);
+                m(2,2) = T(T(1.0) - (xx + yy));
+
+                // no translation or projection: last row/column is (0, 0, 0, 1)
+                m(3,0) = m(3,1) = m(3,2) = m(0,3) = m(1,3) = m(2,3) = T(0.0);
+                m(3,3) = T(1.0);
+            }
+
+            // Load the four components, in order, from the array qp points to.
+            // Returns *this so calls can be chained.
+            quaternion &set_value(const T *qp)
+            {
+                int idx = 0;
+
+                while (idx < 4)
+                {
+                    _array[idx] = qp[idx];
+                    ++idx;
+                }
+
+                return *this;
+            }
+
+            // Build the quaternion from the upper-left 3x3 rotation part of m.
+            //
+            // When the trace is positive the fourth component is recovered
+            // first; otherwise the largest diagonal element selects which of
+            // the first three components is recovered first, which keeps the
+            // square root and the division well conditioned.
+            //
+            // NOTE(review): the off-diagonal differences used here,
+            // (m(1,2) - m(2,1)) etc., have the opposite sign to what
+            // get_value(matrix4<T> &) writes, so a get_value()/set_value()
+            // round trip appears to yield the conjugate -- confirm which
+            // convention callers rely on before changing it.
+            quaternion &set_value(const matrix4<T> &m)
+            {
+                T tr, s;
+                int i, j, k;
+                const int nxt[3] = { 1, 2, 0 };
+
+                tr = m(0,0) + m(1,1) + m(2,2);
+
+                if (tr > T(0))
+                {
+                    s = T(sqrt(tr + m(3,3)));
+                    _array[3] = T(s * 0.5);
+                    s = T(0.5) / s;
+
+                    _array[0] = T((m(1,2) - m(2,1)) * s);
+                    _array[1] = T((m(2,0) - m(0,2)) * s);
+                    _array[2] = T((m(0,1) - m(1,0)) * s);
+                }
+                else
+                {
+                    // i = index of the largest diagonal element, j and k the
+                    // other two in cyclic order
+                    i = 0;
+
+                    if (m(1,1) > m(0,0))
+                    {
+                        i = 1;
+                    }
+
+                    if (m(2,2) > m(i,i))
+                    {
+                        i = 2;
+                    }
+
+                    j = nxt[i];
+                    k = nxt[j];
+
+                    // 1 + m(i,i) - m(j,j) - m(k,k) == 4 * q[i]^2 for a rotation
+                    // matrix; this must use the diagonal element m(i,i), not
+                    // the off-diagonal m(i,j).
+                    s = T(sqrt((m(i,i) - (m(j,j) + m(k,k))) + T(1.0)));
+
+                    _array[i] = T(s * 0.5);
+                    s = T(0.5 / s);
+
+                    _array[3] = T((m(j,k) - m(k,j)) * s);
+                    _array[j] = T((m(i,j) + m(j,i)) * s);
+                    _array[k] = T((m(i,k) + m(k,i)) * s);
+                }
+
+                return *this;
+            }
+
+            // Set from a rotation axis and an angle theta in radians.
+            //
+            // The axis does not have to be unit length: its length is divided
+            // out.  A zero-length axis produces the identity rotation
+            // (0, 0, 0, 1).  Returns *this.
+            quaternion &set_value(const vec3<T> &axis, T theta)
+            {
+                T sqnorm = square_norm(axis);
+
+                if (sqnorm == T(0.0))
+                {
+                    // axis too small.
+                    x = y = z = T(0.0);
+                    w = T(1.0);
+                }
+                else
+                {
+                    // a rotation by theta is stored as sin/cos of theta / 2
+                    theta *= T(0.5);
+                    T sin_theta = T(sin(theta));
+
+                    // fold the axis normalization into the sine factor;
+                    // skipped when the axis is already exactly unit length
+                    if (sqnorm != T(1))
+                    {
+                        sin_theta /= T(sqrt(sqnorm));
+                    }
+
+                    x = sin_theta * axis[0];
+                    y = sin_theta * axis[1];
+                    z = sin_theta * axis[2];
+                    w = T(cos(theta));
+                }
+
+                return *this;
+            }
+
+            // Set to the rotation that takes the direction rotateFrom onto the
+            // direction rotateTo.  Neither vector needs to be unit length.
+            //
+            // Parallel vectors give the identity rotation (0, 0, 0, 1);
+            // anti-parallel vectors give a half turn about some axis
+            // perpendicular to rotateFrom.  Returns *this.
+            quaternion &set_value(const vec3<T> &rotateFrom, const vec3<T> &rotateTo)
+            {
+                vec3<T> p1, p2;
+                T alpha;
+
+                p1 = normalize(rotateFrom);
+                p2 = normalize(rotateTo);
+
+                // cosine of the angle between the two directions
+                alpha = dot(p1, p2);
+
+                // Parallel: nothing to rotate.  Use the identity explicitly --
+                // a default-constructed quaternion is all zeros, and
+                // multiplying by it would wipe out any rotation it is combined
+                // with.  ">=" also catches rounding slightly above 1, which
+                // would make acos() below return NaN.
+                if (alpha >= T(1.0))
+                {
+                    set_value(T(0.0), T(0.0), T(0.0), T(1.0));
+                    return *this;
+                }
+
+                // Anti-parallel: the cross product below would be zero, so
+                // pick an arbitrary axis perpendicular to p1 and rotate by pi.
+                if (alpha <= T(-1.0))
+                {
+                    vec3<T> v;
+
+                    // any vector that is not parallel to p1 will do as a seed
+                    if (p1[0] != p1[1] || p1[0] != p1[2])
+                    {
+                        v = vec3<T>(p1[1], p1[2], p1[0]);
+                    }
+                    else
+                    {
+                        v = vec3<T>(-p1[0], p1[1], p1[2]);
+                    }
+
+                    // remove the component along p1, leaving a perpendicular axis
+                    v -= p1 * dot(p1, v);
+                    v = normalize(v);
+
+                    set_value(v, T(3.1415926));
+                    return *this;
+                }
+
+                // general case: rotate about the common perpendicular
+                p1 = normalize(cross(p1, p2));
+
+                set_value(p1,T(acos(alpha)));
+
+                return *this;
+            }
+
+            // Sets this quaternion from two (look, up) direction pairs. It is
+            // composed of two rotations: one built from from_look and to_look,
+            // and a second built from the already-rotated from_up and to_up.
+            // Returns *this.
+            quaternion &set_value(const vec3<T> &from_look, const vec3<T> &from_up,
+                                  const vec3<T> &to_look, const vec3<T> &to_up)
+            {
+                // First rotation: look direction to look direction.
+                const quaternion look_rotation(from_look, to_look);
+
+                // Carry the source up vector through the first rotation.
+                vec3<T> carried_up(from_up);
+                look_rotation.mult_vec(carried_up);
+
+                // Second rotation: carried up vector to the target up vector.
+                const quaternion twist_rotation(carried_up, to_up);
+
+                // Result is twist_rotation * look_rotation.
+                *this = twist_rotation;
+                *this *= look_rotation;
+                return *this;
+            }
+
+            // In-place quaternion product: *this = (*this) * qr.
+            // Returns *this.
+            quaternion &operator *= (const quaternion<T> &qr)
+            {
+                // Snapshot BOTH operands before any component is written.
+                // qr may refer to *this (q *= q); reading it after w has been
+                // overwritten would mix old and new components.
+                const quaternion ql(*this);
+                const quaternion rhs(qr);
+
+                w = ql.w * rhs.w - ql.x * rhs.x - ql.y * rhs.y - ql.z * rhs.z;
+                x = ql.w * rhs.x + ql.x * rhs.w + ql.y * rhs.z - ql.z * rhs.y;
+                y = ql.w * rhs.y + ql.y * rhs.w + ql.z * rhs.x - ql.x * rhs.z;
+                z = ql.w * rhs.z + ql.z * rhs.w + ql.x * rhs.y - ql.y * rhs.x;
+
+                return *this;
+            }
+
+            // Returns a copy of q with all four components divided by its
+            // length. No guard is applied for a zero-length quaternion: the
+            // reciprocal is then a division by zero.
+            friend quaternion normalize(const quaternion<T> &q)
+            {
+                quaternion r(q);
+                T rnorm = T(1.0) / T(sqrt(q.w * q.w + q.x * q.x + q.y * q.y + q.z * q.z));
+
+                r.x *= rnorm;
+                r.y *= rnorm;
+                r.z *= rnorm;
+                r.w *= rnorm;
+
+                // The scaled copy must be returned explicitly; falling off the
+                // end of a value-returning function is undefined behavior.
+                return r;
+            }
+
+            // Returns a copy of q whose first three components (x, y, z) are
+            // negated; w is left unchanged.
+            friend quaternion<T> conjugate(const quaternion<T> &q)
+            {
+                quaternion<T> flipped(q);
+
+                // Indices 0..2 are the vector part; index 3 (w) is skipped.
+                for (int axis = 0; axis < 3; ++axis)
+                {
+                    flipped._array[axis] *= T(-1.0);
+                }
+
+                return flipped;
+            }
+
+            // Returns conjugate(q) as the inverse of q. No division by the
+            // squared length is performed, so the result is the true inverse
+            // only when q has unit length.
+            // NOTE(review): callers presumably pass normalized quaternions -
+            // confirm at call sites.
+            friend quaternion<T> inverse(const quaternion<T> &q)
+            {
+                return conjugate(q);
+            }
+
+            //
+            // Quaternion multiplication with cartesian vector
+            // v' = q*v*q(star)
+            //
+            // Transforms src by this quaternion and stores the result in dst.
+            // The product is evaluated in expanded form:
+            //   v' = (w^2 - |u|^2) v + 2 (u . v) u + 2 w (u x v),  u = (x, y, z)
+            // All three output components are computed before dst is written,
+            // so passing the same vector as src and dst is safe.
+            //
+            void mult_vec(const vec3<T> &src, vec3<T> &dst) const
+            {
+                T v_coef = w * w - x * x - y * y - z * z;
+                T u_coef = T(2.0) * (src[0] * x + src[1] * y + src[2] * z);
+                T c_coef = T(2.0) * w;
+
+                // Compute into temporaries first: writing dst.v[0] directly
+                // would corrupt src.v[0] for the remaining components when
+                // src and dst alias.
+                const T out_x = v_coef * src.v[0] + u_coef * x + c_coef * (y * src.v[2] - z * src.v[1]);
+                const T out_y = v_coef * src.v[1] + u_coef * y + c_coef * (z * src.v[0] - x * src.v[2]);
+                const T out_z = v_coef * src.v[2] + u_coef * z + c_coef * (x * src.v[1] - y * src.v[0]);
+
+                dst.v[0] = out_x;
+                dst.v[1] = out_y;
+                dst.v[2] = out_z;
+            }
+
+            // In-place variant: transforms src_and_dst by this quaternion and
+            // overwrites it with the result. The input is copied first so the
+            // two-argument overload reads from a vector it does not write to.
+            void mult_vec(vec3<T> &src_and_dst) const
+            {
+                const vec3<T> input_copy(src_and_dst);
+                mult_vec(input_copy, src_and_dst);
+            }
+
+            // Multiplies the rotation angle of this quaternion by scaleFactor:
+            // reads the current axis and angle with get_value(), scales the
+            // angle, and rebuilds the quaternion with set_value().
+            void scale_angle(T scaleFactor)
+            {
+                vec3<T> rot_axis;
+                T rot_angle;
+
+                // get_value() fills in both outputs.
+                get_value(rot_axis, rot_angle);
+
+                rot_angle *= scaleFactor;
+
+                set_value(rot_axis, rot_angle);
+            }
+
+            // Spherical linear interpolation between p and q with parameter
+            // alpha: the result is a weighted sum of p and q with sine-based
+            // weights. If the 4-component dot product of p and q is negative,
+            // the sign of q's weight is flipped so interpolation runs along
+            // the shorter arc. When |dot| >= 1 the function returns p as is.
+            friend quaternion<T> slerp(const quaternion<T> &p, const quaternion<T> &q, T alpha)
+            {
+                T cos_omega = p.x * q.x + p.y * q.y + p.z * q.z + p.w * q.w;
+
+                // q lies on the opposite hemisphere from p: interpolate
+                // towards -q instead.
+                const bool use_negated_q = (cos_omega < T(0));
+
+                if (use_negated_q)
+                {
+                    cos_omega = -cos_omega;
+                }
+
+                // No angle to interpolate across (also keeps sin(omega) from
+                // being zero in the division below).
+                if (cos_omega >= T(1))
+                {
+                    return p;
+                }
+
+                const T omega = T(acos(cos_omega));
+                const T one_over_sin_omega = T(1.0) / T(sin(omega));
+
+                // Weight of p uses the complementary parameter (1 - alpha).
+                const T complement = T(1) - alpha;
+                const T weight_p = T(sin(omega*complement)  * one_over_sin_omega);
+                T weight_q = T(sin(omega*alpha) * one_over_sin_omega);
+
+                if (use_negated_q)
+                {
+                    weight_q = -weight_q;
+                }
+
+                quaternion blended;
+                blended.x = weight_p * p._array[0]+ weight_q * q._array[0];
+                blended.y = weight_p * p._array[1]+ weight_q * q._array[1];
+                blended.z = weight_p * p._array[2]+ weight_q * q._array[2];
+                blended.w = weight_p * p._array[3]+ weight_q * q._array[3];
+                return blended;
+            }
+
+            // Mutable access to component i of _array (0..2 hold x, y, z and
+            // 3 holds w). The index is not range-checked.
+            T &operator [](int i)
+            {
+                return _array[i];
+            }
+
+            // Read-only access to component i of _array (0..2 hold x, y, z
+            // and 3 holds w). The index is not range-checked.
+            const T &operator [](int i) const
+            {
+                return _array[i];
+            }
+
+
+            // Component-wise exact equality: true only when all four stored
+            // components compare equal.
+            friend bool operator == (const quaternion<T> &lhs, const quaternion<T> &rhs)
+            {
+                return lhs._array[0] == rhs._array[0]
+                    && lhs._array[1] == rhs._array[1]
+                    && lhs._array[2] == rhs._array[2]
+                    && lhs._array[3] == rhs._array[3];
+            }
+
+            // Component-wise inequality: true as soon as any of the four
+            // stored components differs, false when all of them compare equal.
+            // This is the exact negation of operator ==.
+            friend bool operator != (const quaternion<T> &lhs, const quaternion<T> &rhs)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    // Written as !(a == b) so the answer is always the
+                    // opposite of what operator == reports for the same pair.
+                    if (!(lhs._array[i] == rhs._array[i]))
+                    {
+                        return true;
+                    }
+                }
+
+                return false;
+            }
+
+            // Quaternion product lhs * rhs, returned by value. Implemented by
+            // applying operator *= to a copy of lhs, so neither operand is
+            // modified.
+            friend quaternion<T> operator * (const quaternion<T> &lhs, const quaternion<T> &rhs)
+            {
+                return quaternion(lhs) *= rhs;
+            }
+
+
+            // Component storage. The same four values are reachable by name
+            // (x, y, z, w) or by index through _array; the methods of this
+            // class use _array[0..2] for x, y, z and _array[3] for w.
+            union
+            {
+                struct
+                {
+                    T x;
+                    T y;
+                    T z;
+                    T w;
+                };
+                T _array[4];
+            };
+
+    };
+
+
+
+};
+
+#endif
--- a/Common/nvShaderUtils.h
+++ b/Common/nvShaderUtils.h
@ -0,0 +1,260 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *
+ * Utility functions for compiling shaders and programs
+ *
+ * Author: Evan Hart
+ * Copyright (c) NVIDIA Corporation. All rights reserved.
+ *
+ */
+
+
+#ifndef NV_SHADER_UTILS_H
+#define NV_SHADER_UTILS_H
+
+#include <stdio.h>
+#include <string.h>
+
+namespace nv {
+
+//
+//
+////////////////////////////////////////////////////////////
+// Compiles the GLSL source text `shader` into a new shader object of type
+// `target`. Returns the shader object name on success. Returns 0 when the
+// object cannot be created or when compilation fails; in the latter case the
+// shader object is deleted and, if NV_REPORT_COMPILE_ERRORS is defined, up to
+// 256 bytes of the info log are written to stderr.
+inline GLuint CompileGLSLShader(GLenum target, const char *shader) {
+  const GLuint shader_id = glCreateShader(target);
+
+  // Creation failed: there is nothing to compile or clean up.
+  if (shader_id == 0) {
+    return 0;
+  }
+
+  glShaderSource(shader_id, 1, &shader, NULL);
+  glCompileShader(shader_id);
+
+  GLint compile_status = 0;
+  glGetShaderiv(shader_id, GL_COMPILE_STATUS, &compile_status);
+
+  if (compile_status) {
+    return shader_id;
+  }
+
+  // Compilation failed: optionally report, then release the object.
+#ifdef NV_REPORT_COMPILE_ERRORS
+  char log_text[256] = "";
+  glGetShaderInfoLog(shader_id, 256, NULL, log_text);
+  fprintf(stderr, "Compile failed:\n%s\n", log_text);
+#endif
+  glDeleteShader(shader_id);
+  return 0;
+}
+
+//
+//
+////////////////////////////////////////////////////////////
+// Loads the whole of `filename` into memory and compiles it with
+// CompileGLSLShader() as a shader of type `target`.
+// Returns the shader object name, or 0 if the file cannot be opened, its
+// size cannot be determined, its contents cannot be read, or compilation
+// fails.
+inline GLuint CompileGLSLShaderFromFile(GLenum target, const char *filename) {
+  FILE *shaderFile;
+  char *text;
+  long size;
+  size_t fsize = 0;
+
+  // read files as binary to prevent problems from newline translation
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+
+  if (fopen_s(&shaderFile, filename, "rb") != 0)
+#else
+  if ((shaderFile = fopen(filename, "rb")) == 0)
+#endif
+  {
+    return 0;
+  }
+
+  // Get the length of the file
+  fseek(shaderFile, 0, SEEK_END);
+  size = ftell(shaderFile);
+
+  // ftell() reports failure with a negative value; using it as a buffer size
+  // or read length would be invalid.
+  if (size < 0) {
+    fclose(shaderFile);
+    return 0;
+  }
+
+  // Read the file contents from the start, then close file and add a null
+  // terminator
+  fseek(shaderFile, 0, SEEK_SET);
+  text = new char[size + 1];
+  fsize = fread(text, size, 1, shaderFile);
+  fclose(shaderFile);
+
+  if (fsize == 0) {
+    printf("CompileGLSLShaderFromFile(), error... fsize = 0\n");
+
+    // For a non-empty file this is a genuine read failure: the buffer
+    // contents are indeterminate and must not be handed to the compiler.
+    // An empty file also lands here but still yields a valid empty string.
+    if (size > 0) {
+      delete[] text;
+      return 0;
+    }
+  }
+
+  text[size] = '\0';
+
+  GLuint object = CompileGLSLShader(target, text);
+
+  delete[] text;
+
+  return object;
+}
+
+// Create a program composed of vertex and fragment shaders.
+// Attaches both shaders to a new program object and links it. Returns the
+// program name on success; on link failure the program is deleted and 0 is
+// returned. When NV_REPORT_COMPILE_ERRORS is defined the program info log,
+// if there is one, is printed to stdout.
+inline GLuint LinkGLSLProgram(GLuint vertexShader, GLuint fragmentShader) {
+  GLuint program = glCreateProgram();
+  glAttachShader(program, vertexShader);
+  glAttachShader(program, fragmentShader);
+  glLinkProgram(program);
+
+#ifdef NV_REPORT_COMPILE_ERRORS
+  // Get error log.
+  GLint charsWritten = 0, infoLogLength = 0;
+  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infoLogLength);
+
+  // Only fetch and print when a log exists: with a zero length the buffer
+  // would be empty and never null-terminated.
+  if (infoLogLength > 0) {
+    char *infoLog = new char[infoLogLength];
+    infoLog[0] = '\0';
+    glGetProgramInfoLog(program, infoLogLength, &charsWritten, infoLog);
+    // Print through "%s": the log is driver-supplied text and may contain
+    // '%' characters, so it must not be used as the format string itself.
+    printf("%s", infoLog);
+    delete[] infoLog;
+  }
+#endif
+
+  // Test linker result.
+  GLint linkSucceed = GL_FALSE;
+  glGetProgramiv(program, GL_LINK_STATUS, &linkSucceed);
+
+  if (linkSucceed == GL_FALSE) {
+    glDeleteProgram(program);
+    return 0;
+  }
+
+  return program;
+}
+
+// Create a program composed of vertex, geometry and fragment shaders.
+// inputType, vertexOut and outputType are passed to glProgramParameteriEXT
+// as GL_GEOMETRY_INPUT_TYPE_EXT, GL_GEOMETRY_VERTICES_OUT_EXT and
+// GL_GEOMETRY_OUTPUT_TYPE_EXT before linking. Returns the program name on
+// success; on link failure the program is deleted and 0 is returned. When
+// NV_REPORT_COMPILE_ERRORS is defined the program info log, if there is one,
+// is printed to stdout.
+inline GLuint LinkGLSLProgram(GLuint vertexShader, GLuint geometryShader,
+                              GLint inputType, GLint vertexOut,
+                              GLint outputType, GLuint fragmentShader) {
+  GLuint program = glCreateProgram();
+  glAttachShader(program, vertexShader);
+  glAttachShader(program, geometryShader);
+  glProgramParameteriEXT(program, GL_GEOMETRY_INPUT_TYPE_EXT, inputType);
+  glProgramParameteriEXT(program, GL_GEOMETRY_VERTICES_OUT_EXT, vertexOut);
+  glProgramParameteriEXT(program, GL_GEOMETRY_OUTPUT_TYPE_EXT, outputType);
+  glAttachShader(program, fragmentShader);
+  glLinkProgram(program);
+
+#ifdef NV_REPORT_COMPILE_ERRORS
+  // Get error log.
+  GLint charsWritten = 0, infoLogLength = 0;
+  glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infoLogLength);
+
+  // Only fetch and print when a log exists: with a zero length the buffer
+  // would be empty and never null-terminated.
+  if (infoLogLength > 0) {
+    char *infoLog = new char[infoLogLength];
+    infoLog[0] = '\0';
+    glGetProgramInfoLog(program, infoLogLength, &charsWritten, infoLog);
+    // Print through "%s": the log is driver-supplied text and may contain
+    // '%' characters, so it must not be used as the format string itself.
+    printf("%s", infoLog);
+    delete[] infoLog;
+  }
+#endif
+
+  // Test linker result.
+  GLint linkSucceed = GL_FALSE;
+  glGetProgramiv(program, GL_LINK_STATUS, &linkSucceed);
+
+  if (linkSucceed == GL_FALSE) {
+    glDeleteProgram(program);
+    return 0;
+  }
+
+  return program;
+}
+
+//
+//
+////////////////////////////////////////////////////////////
+// Creates an ARB assembly program of type `program_type` from the
+// null-terminated text `code`. The new program is bound to `program_type`
+// while it is loaded. Returns the program name on success. If the driver
+// reports an error position other than -1, the program is deleted and 0 is
+// returned; with NV_REPORT_COMPILE_ERRORS defined the error position and
+// error string are written to stderr first.
+inline GLuint CompileASMShader(GLenum program_type, const char *code) {
+  GLuint program_id;
+  glGenProgramsARB(1, &program_id);
+  glBindProgramARB(program_type, program_id);
+  glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB,
+                     (GLsizei)strlen(code), (GLubyte *)code);
+
+  GLint error_pos;
+  glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos);
+
+  if (error_pos != -1) {
+#ifdef NV_REPORT_COMPILE_ERRORS
+    const GLubyte *error_string;
+    error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB);
+    fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos,
+            error_string);
+#endif
+    // Release the name generated above: the caller only receives 0 and has
+    // no way to free it. This mirrors CompileGLSLShader(), which deletes its
+    // shader object on failure.
+    glDeleteProgramsARB(1, &program_id);
+    return 0;
+  }
+
+  return program_id;
+}
+
+//
+//
+////////////////////////////////////////////////////////////
+// Loads the whole of `filename` into memory and passes it to
+// CompileASMShader() as a program of type `target`.
+// Returns the program name, or 0 if the file cannot be opened, its size
+// cannot be determined, its contents cannot be read, or the program fails
+// to load.
+inline GLuint CompileASMShaderFromFile(GLenum target, const char *filename) {
+  FILE *shaderFile;
+  char *text;
+  long size;
+  size_t fsize = 0;
+
+  // read files as binary to prevent problems from newline translation
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+
+  if (fopen_s(&shaderFile, filename, "rb") != 0)
+#else
+  if ((shaderFile = fopen(filename, "rb")) == 0)
+#endif
+  {
+    return 0;
+  }
+
+  // Get the length of the file
+  fseek(shaderFile, 0, SEEK_END);
+  size = ftell(shaderFile);
+
+  // ftell() reports failure with a negative value; using it as a buffer size
+  // or read length would be invalid.
+  if (size < 0) {
+    fclose(shaderFile);
+    return 0;
+  }
+
+  // Read the file contents from the start, then close file and add a null
+  // terminator
+  fseek(shaderFile, 0, SEEK_SET);
+  text = new char[size + 1];
+  fsize = fread(text, size, 1, shaderFile);
+  fclose(shaderFile);
+
+  if (fsize == 0) {
+    // The message names this function (it previously named the GLSL one).
+    printf("CompileASMShaderFromFile(), error... fsize = 0\n");
+
+    // For a non-empty file this is a genuine read failure: the buffer
+    // contents are indeterminate and must not be handed to the driver.
+    // An empty file also lands here but still yields a valid empty string.
+    if (size > 0) {
+      delete[] text;
+      return 0;
+    }
+  }
+
+  text[size] = '\0';
+
+  GLuint program_id = CompileASMShader(target, text);
+
+  delete[] text;
+
+  return program_id;
+}
+
+}  // namespace nv
+#endif
--- a/Common/nvVector.h
+++ b/Common/nvVector.h
--- a/Common/nvrtc_helper.h
+++ b/Common/nvrtc_helper.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/param.h
+++ b/Common/param.h
@ -0,0 +1,236 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Simple parameter system
+ sgreen@nvidia.com 4/2001
+*/
+
+#ifndef PARAM_H
+#define PARAM_H
+
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Abstract base class for a named parameter. It stores only the name; every
+// other operation is pure virtual and supplied by the derived classes in
+// this header (Param<T> for a single value, ParamList for a collection).
+class ParamBase {
+ public:
+  ParamBase(const char *name) : m_name(name) {}
+  virtual ~ParamBase() {}
+
+  // Returns a mutable reference to the stored name.
+  std::string &GetName() { return m_name; }
+
+  // Current value converted to float / int / display string.
+  virtual float GetFloatValue() = 0;
+  virtual int GetIntValue() = 0;
+  virtual std::string GetValueString() = 0;
+
+  // Reset to the initial state, or step forwards / backwards. The meaning of
+  // a step is defined by the derived class.
+  virtual void Reset() = 0;
+  virtual void Increment() = 0;
+  virtual void Decrement() = 0;
+
+  // Read / write the value as a fraction.
+  // NOTE(review): Param<T> maps this onto its [min, max] range; callers in
+  // this header pass values between 0 and 1.
+  virtual float GetPercentage() = 0;
+  virtual void SetPercentage(float p) = 0;
+
+  // Text serialization to and from a stream.
+  virtual void Write(std::ostream &stream) = 0;
+  virtual void Read(std::istream &stream) = 0;
+
+  // True when the object is a container of other parameters.
+  virtual bool IsList() = 0;
+
+ protected:
+  std::string m_name;
+};
+
+// derived class for single-valued parameter
+//
+// Holds one value of type T together with its default, minimum, maximum and
+// step size. The value lives either inside the object (m_value) or in a
+// caller-owned variable passed as `ptr`; all access goes through m_ptr, so
+// both cases behave the same.
+template <class T>
+class Param : public ParamBase {
+ public:
+  // name  - parameter name
+  // value - initial value, also used as the default restored by Reset()
+  // min, max - bounds enforced by Increment() / Decrement() and used by the
+  //            percentage mapping
+  // step  - amount added or subtracted by Increment() / Decrement()
+  // ptr   - optional external storage; when null the internal m_value is used
+  Param(const char *name, T value = 0, T min = 0, T max = 10000, T step = 1,
+        T *ptr = 0)
+      : ParamBase(name),
+        m_value(value),  // keep the internal slot defined even if ptr is used
+        m_default(value),
+        m_min(min),
+        m_max(max),
+        m_step(step),
+        m_precision(3) {
+    if (ptr) {
+      m_ptr = ptr;
+    } else {
+      m_ptr = &m_value;
+    }
+
+    *m_ptr = value;
+  }
+  ~Param() {}
+
+  T GetValue() const { return *m_ptr; }
+
+  // Stores `value` without clamping and returns the stored value. The
+  // function is declared to return T, so it must actually return something.
+  T SetValue(const T value) {
+    *m_ptr = value;
+    return *m_ptr;
+  }
+
+  float GetFloatValue() { return (float)*m_ptr; }
+  int GetIntValue() { return (int)*m_ptr; }
+
+  // Formats the value in fixed notation with m_precision digits after the
+  // decimal point.
+  std::string GetValueString() {
+    std::ostringstream ost;
+    ost << std::setprecision(m_precision) << std::fixed;
+    ost << *m_ptr;
+    return ost.str();
+  }
+
+  void SetPrecision(int x) { m_precision = x; }
+
+  // Position of the value inside [m_min, m_max] as a fraction. There is no
+  // guard for m_max == m_min.
+  float GetPercentage() { return (*m_ptr - m_min) / (float)(m_max - m_min); }
+
+  // Inverse of GetPercentage(): p = 0 gives m_min, p = 1 gives m_max.
+  void SetPercentage(float p) { *m_ptr = (T)(m_min + p * (m_max - m_min)); }
+
+  void Reset() { *m_ptr = m_default; }
+
+  // Adds one step, clamping at m_max.
+  void Increment() {
+    *m_ptr += m_step;
+
+    if (*m_ptr > m_max) {
+      *m_ptr = m_max;
+    }
+  }
+
+  // Subtracts one step, clamping at m_min.
+  void Decrement() {
+    *m_ptr -= m_step;
+
+    if (*m_ptr < m_min) {
+      *m_ptr = m_min;
+    }
+  }
+
+  // Serialized form is "<name> <value>" on one line.
+  void Write(std::ostream &stream) {
+    stream << m_name << " " << *m_ptr << '\n';
+  }
+  void Read(std::istream &stream) { stream >> m_name >> *m_ptr; }
+
+  bool IsList() { return false; }
+
+ private:
+  T m_value;
+  T *m_ptr;  // pointer to value declared elsewhere
+  T m_default, m_min, m_max, m_step;
+  int m_precision;  // number of digits after decimal point in string output
+};
+
+// Placeholder parameter named "error". ParamList::GetParam(name) returns its
+// address when no parameter with the requested name exists.
+// NOTE(review): it is declared const but handed out through a non-const
+// ParamBase pointer, so callers must not modify it.
+const Param<int> dummy("error");
+
+// list of parameters
+//
+// A ParamBase that owns no value of its own but groups other parameters.
+// Parameters are kept in insertion order (m_params) and indexed by name
+// (m_map); m_current marks the currently selected entry. The list stores
+// raw pointers and never deletes them.
+class ParamList : public ParamBase {
+ public:
+  ParamList(const char *name = "") : ParamBase(name) {
+    active = true;
+    // Give the cursor a defined value even before the first AddParam().
+    m_current = m_params.begin();
+  }
+  ~ParamList() {}
+
+  float GetFloatValue() { return 0.0f; }
+  int GetIntValue() { return 0; }
+
+  // Appends `param` and registers it under its name. The selection is moved
+  // back to the first entry because push_back may invalidate m_current.
+  void AddParam(ParamBase *param) {
+    m_params.push_back(param);
+    m_map[param->GetName()] = param;
+    m_current = m_params.begin();
+  }
+
+  // look-up parameter based on name
+  // Returns the registered parameter, or the address of the global `dummy`
+  // placeholder when the name is unknown. The lookup uses find() so a miss
+  // does not insert an empty entry into the map.
+  ParamBase *GetParam(const char *name) {
+    std::map<std::string, ParamBase *>::const_iterator hit = m_map.find(name);
+
+    if (hit != m_map.end() && hit->second) {
+      return hit->second;
+    } else {
+      return (ParamBase *)&dummy;
+    }
+  }
+
+  // Positional access; the index is not range-checked.
+  ParamBase *GetParam(int i) { return m_params[i]; }
+
+  // Currently selected parameter. Must not be called on an empty list.
+  ParamBase *GetCurrent() { return *m_current; }
+
+  int GetSize() { return (int)m_params.size(); }
+
+  std::string GetValueString() { return m_name; }
+
+  // functions to traverse list
+  void Reset() { m_current = m_params.begin(); }
+
+  // Selects the next entry, wrapping from the last back to the first.
+  void Increment() {
+    // Nothing to select; advancing an iterator of an empty vector is invalid.
+    if (m_params.empty()) {
+      return;
+    }
+
+    m_current++;
+
+    if (m_current == m_params.end()) {
+      m_current = m_params.begin();
+    }
+  }
+
+  // Selects the previous entry, wrapping from the first to the last.
+  void Decrement() {
+    // Nothing to select; end() - 1 does not exist for an empty vector.
+    if (m_params.empty()) {
+      return;
+    }
+
+    if (m_current == m_params.begin()) {
+      m_current = m_params.end() - 1;
+    } else {
+      m_current--;
+    }
+  }
+
+  float GetPercentage() { return 0.0f; }
+  void SetPercentage(float /*p*/) {}
+
+  // Writes the list name on its own line, then every parameter in order.
+  void Write(std::ostream &stream) {
+    stream << m_name << '\n';
+
+    for (std::vector<ParamBase *>::const_iterator p = m_params.begin();
+         p != m_params.end(); ++p) {
+      (*p)->Write(stream);
+    }
+  }
+
+  // Reads the list name, then lets every parameter read itself in order.
+  void Read(std::istream &stream) {
+    stream >> m_name;
+
+    for (std::vector<ParamBase *>::const_iterator p = m_params.begin();
+         p != m_params.end(); ++p) {
+      (*p)->Read(stream);
+    }
+  }
+
+  bool IsList() { return true; }
+
+  // Calls Reset() on every contained parameter.
+  void ResetAll() {
+    for (std::vector<ParamBase *>::const_iterator p = m_params.begin();
+         p != m_params.end(); ++p) {
+      (*p)->Reset();
+    }
+  }
+
+ protected:
+  bool active;
+  std::vector<ParamBase *> m_params;
+  std::map<std::string, ParamBase *> m_map;
+  std::vector<ParamBase *>::const_iterator m_current;
+};
+
+#endif
--- a/Common/paramgl.h
+++ b/Common/paramgl.h
@ -0,0 +1,307 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   ParamListGL
+   - class derived from ParamList to do simple OpenGL rendering of a parameter
+   list sgg 8/2001
+*/
+
+#ifndef PARAMGL_H
+#define PARAMGL_H
+
+#if defined(__APPLE__) || defined(MACOSX)
+#include <GLUT/glut.h>
+#else
+#include <GL/freeglut.h>
+#endif
+
+#include <param.h>
+#include <string.h>
+
+// Pushes the current modelview and projection matrices and replaces them
+// with a window-sized orthographic setup whose y axis is flipped through a
+// translate + negative scale on the modelview matrix. Leaves GL_MODELVIEW as
+// the active matrix mode. Pair every call with endWinCoords().
+inline void beginWinCoords(void) {
+  const int window_w = glutGet(GLUT_WINDOW_WIDTH);
+  const int window_h = glutGet(GLUT_WINDOW_HEIGHT);
+
+  // Modelview: move to the top row of the window and flip y.
+  glMatrixMode(GL_MODELVIEW);
+  glPushMatrix();
+  glLoadIdentity();
+  glTranslatef(0.0, (GLfloat)(window_h - 1.0), 0.0);
+  glScalef(1.0, -1.0, 1.0);
+
+  // Projection: orthographic volume spanning the window.
+  glMatrixMode(GL_PROJECTION);
+  glPushMatrix();
+  glLoadIdentity();
+  glOrtho(0, window_w, 0, window_h, -1, 1);
+
+  glMatrixMode(GL_MODELVIEW);
+}
+
+// Undoes beginWinCoords(): pops the projection matrix, then the modelview
+// matrix, leaving GL_MODELVIEW as the active matrix mode.
+inline void endWinCoords(void) {
+  glMatrixMode(GL_PROJECTION);
+  glPopMatrix();
+
+  glMatrixMode(GL_MODELVIEW);
+  glPopMatrix();
+}
+
+// Draws the null-terminated string `s` with the GLUT bitmap font `font`,
+// starting at raster position (x, y). Uses the current GL color.
+inline void glPrint(int x, int y, const char *s, void *font) {
+  glRasterPos2f((GLfloat)x, (GLfloat)y);
+
+  // Emit one glyph per character until the terminator is reached.
+  for (const char *glyph = s; *glyph != '\0'; ++glyph) {
+    glutBitmapCharacter(font, *glyph);
+  }
+}
+
+// Draws `s` twice: first in black, offset by one unit in x and y, then in
+// `color` (three floats, passed to glColor3fv) at (x, y), so the text
+// appears on top of a drop shadow.
+inline void glPrintShadowed(int x, int y, const char *s, void *font,
+                            float *color) {
+  // Shadow pass.
+  const int shadow_x = x - 1;
+  const int shadow_y = y - 1;
+  glColor3f(0.0f, 0.0f, 0.0f);
+  glPrint(shadow_x, shadow_y, s, font);
+
+  // Foreground pass.
+  glColor3fv((GLfloat *)color);
+  glPrint(x, y, s, font);
+}
+
+// ParamList that can draw itself with OpenGL/GLUT and react to mouse and
+// special-key input. Each parameter occupies one row of m_separation pixels
+// showing its name, its value string and a slider bar filled according to
+// GetPercentage(). Render() records the position it was drawn at, and the
+// input handlers hit-test against that recorded position.
+class ParamListGL : public ParamList {
+ public:
+  ParamListGL(const char *name = "")
+      : ParamList(name),
+        m_active(true),
+        m_text_color_selected(1.0, 1.0, 1.0),
+        m_text_color_unselected(0.75, 0.75, 0.75),
+        m_text_color_shadow(0.0, 0.0, 0.0),
+        m_bar_color_outer(0.25, 0.25, 0.25),
+        m_bar_color_inner(1.0, 1.0, 1.0) {
+    m_font = (void *)GLUT_BITMAP_9_BY_15;  // GLUT_BITMAP_8_BY_13;
+    m_font_h = 15;
+    m_bar_x = 260;
+    m_bar_w = 250;
+    m_bar_h = 10;
+    m_bar_offset = 5;
+    m_text_x = 5;
+    m_separation = 15;
+    m_value_x = 200;
+    m_start_x = 0;
+    m_start_y = 0;
+  }
+
+  // Draws the list with its top-left corner at window position (x, y) and
+  // remembers that position for Mouse() / Motion(). When `shadow` is true
+  // the text is drawn with a drop shadow.
+  void Render(int x, int y, bool shadow = false) {
+    beginWinCoords();
+
+    m_start_x = x;
+    m_start_y = y;
+
+    for (std::vector<ParamBase *>::const_iterator p = m_params.begin();
+         p != m_params.end(); ++p) {
+      if ((*p)->IsList()) {
+        // Nested list: draw it indented and skip the rows it used.
+        // NOTE(review): assumes every nested list is a ParamListGL.
+        ParamListGL *list = (ParamListGL *)(*p);
+        list->Render(x + 10, y);
+        y += m_separation * list->GetSize();
+      } else {
+        if (p == m_current) {
+          glColor3fv(&m_text_color_selected.r);
+        } else {
+          glColor3fv(&m_text_color_unselected.r);
+        }
+
+        if (shadow) {
+          glPrintShadowed(x + m_text_x, y + m_font_h, (*p)->GetName().c_str(),
+                          m_font,
+                          (p == m_current) ? &m_text_color_selected.r
+                                           : &m_text_color_unselected.r);
+          glPrintShadowed(x + m_value_x, y + m_font_h,
+                          (*p)->GetValueString().c_str(), m_font,
+                          (p == m_current) ? &m_text_color_selected.r
+                                           : &m_text_color_unselected.r);
+        } else {
+          glPrint(x + m_text_x, y + m_font_h, (*p)->GetName().c_str(), m_font);
+          glPrint(x + m_value_x, y + m_font_h, (*p)->GetValueString().c_str(),
+                  m_font);
+        }
+
+        // Slider outline.
+        glColor3fv((GLfloat *)&m_bar_color_outer.r);
+        glBegin(GL_LINE_LOOP);
+        glVertex2f((GLfloat)(x + m_bar_x), (GLfloat)(y + m_bar_offset));
+        glVertex2f((GLfloat)(x + m_bar_x + m_bar_w),
+                   (GLfloat)(y + m_bar_offset));
+        glVertex2f((GLfloat)(x + m_bar_x + m_bar_w),
+                   (GLfloat)(y + m_bar_offset + m_bar_h));
+        glVertex2f((GLfloat)(x + m_bar_x),
+                   (GLfloat)(y + m_bar_offset + m_bar_h));
+        glEnd();
+
+        // Slider fill, proportional to the parameter's percentage.
+        glColor3fv((GLfloat *)&m_bar_color_inner.r);
+        glRectf(
+            (GLfloat)(x + m_bar_x), (GLfloat)(y + m_bar_offset + m_bar_h),
+            (GLfloat)(x + m_bar_x + ((m_bar_w - 1) * (*p)->GetPercentage())),
+            (GLfloat)(y + m_bar_offset + 1));
+
+        y += m_separation;
+      }
+    }
+
+    endWinCoords();
+  }
+
+  // Mouse button handler. Returns false, and deactivates the list, when y is
+  // outside the rows drawn by the last Render(). Otherwise activates the
+  // list and returns true; on a left-button press it also selects the row
+  // under the cursor and, if the cursor is over the slider, sets the value.
+  bool Mouse(int x, int y, int button = GLUT_LEFT_BUTTON,
+             int state = GLUT_DOWN) {
+    const int list_height = m_separation * (int)m_params.size();
+
+    if ((y < m_start_y) || (y > m_start_y + list_height - 1)) {
+      m_active = false;
+      return false;
+    }
+
+    m_active = true;
+
+    // Row index; the range check above keeps it inside [0, size).
+    int i = (y - m_start_y) / m_separation;
+
+    if ((button == GLUT_LEFT_BUTTON) && (state == GLUT_DOWN)) {
+      m_current = m_params.begin() + i;
+
+      // The slider is drawn at m_start_x + m_bar_x, so hit-test there too.
+      const int bar_left = m_start_x + m_bar_x;
+
+      if ((x > bar_left) && (x < bar_left + m_bar_w)) {
+        Motion(x, y);
+      }
+    }
+
+    return true;
+  }
+
+  // Mouse drag handler. Returns false when y is outside the rows drawn by
+  // the last Render(). Otherwise sets the selected parameter's percentage
+  // from the cursor's horizontal position along the slider (clamped to
+  // 0 and 1 beyond its ends) and returns true.
+  bool Motion(int x, int y) {
+    if ((y < m_start_y) ||
+        (y > m_start_y + (m_separation * (int)m_params.size()) - 1)) {
+      return false;
+    }
+
+    // Cursor position relative to the left edge of the slider as drawn.
+    const int bar_pos = x - (m_start_x + m_bar_x);
+
+    if (bar_pos < 0) {
+      (*m_current)->SetPercentage(0.0);
+      return true;
+    }
+
+    if (bar_pos > m_bar_w) {
+      (*m_current)->SetPercentage(1.0);
+      return true;
+    }
+
+    (*m_current)->SetPercentage(bar_pos / (float)m_bar_w);
+    return true;
+  }
+
+  // GLUT special-key handler; ignored while the list is inactive.
+  // Up/Down move the selection, Left/Right step the selected value, Home
+  // resets it and End sets it to 100%. Requests a redisplay afterwards.
+  // The cursor coordinates are unused.
+  void Special(int key, int /*x*/, int /*y*/) {
+    if (!m_active) return;
+
+    switch (key) {
+      case GLUT_KEY_DOWN:
+        Increment();
+        break;
+
+      case GLUT_KEY_UP:
+        Decrement();
+        break;
+
+      case GLUT_KEY_RIGHT:
+        GetCurrent()->Increment();
+        break;
+
+      case GLUT_KEY_LEFT:
+        GetCurrent()->Decrement();
+        break;
+
+      case GLUT_KEY_HOME:
+        GetCurrent()->Reset();
+        break;
+
+      case GLUT_KEY_END:
+        GetCurrent()->SetPercentage(1.0);
+        break;
+    }
+
+    glutPostRedisplay();
+  }
+
+  // Sets the GLUT bitmap font and the height used for text placement.
+  void SetFont(void *font, int height) {
+    m_font = font;
+    m_font_h = height;
+  }
+
+  void SetSelectedColor(float r, float g, float b) {
+    m_text_color_selected = Color(r, g, b);
+  }
+  void SetUnSelectedColor(float r, float g, float b) {
+    m_text_color_unselected = Color(r, g, b);
+  }
+  void SetBarColorInner(float r, float g, float b) {
+    m_bar_color_inner = Color(r, g, b);
+  }
+  void SetBarColorOuter(float r, float g, float b) {
+    m_bar_color_outer = Color(r, g, b);
+  }
+
+  // Enables or disables keyboard handling in Special().
+  void SetActive(bool b) { m_active = b; }
+
+ private:
+  void *m_font;
+  int m_font_h;  // font height
+
+  int m_bar_x;       // bar start x position
+  int m_bar_w;       // bar width
+  int m_bar_h;       // bar height
+  int m_text_x;      // text start x position
+  int m_separation;  // bar separation in y
+  int m_value_x;     // value text x position
+  int m_bar_offset;  // bar offset in y
+
+  int m_start_x, m_start_y;  // position passed to the last Render()
+
+  bool m_active;
+
+  // RGB triple laid out as three consecutive floats so &r can be passed to
+  // glColor3fv.
+  struct Color {
+    Color(float _r, float _g, float _b) {
+      r = _r;
+      g = _g;
+      b = _b;
+    }
+    float r, g, b;
+  };
+
+  Color m_text_color_selected;
+  Color m_text_color_unselected;
+  Color m_text_color_shadow;
+  Color m_bar_color_outer;
+  Color m_bar_color_inner;
+};
+
+#endif
--- a/Common/rendercheck_d3d10.cpp
+++ b/Common/rendercheck_d3d10.cpp
@ -0,0 +1,128 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//  Utility funcs to wrap up saving a surface or the back buffer as a PPM file
+//  In addition, wraps up a threshold comparison of two PPMs.
+//
+//  These functions are designed to be used to implement an automated QA testing
+//  for SDK samples.
+//
+//  Author: Bryan Dudash
+//  Email: sdkfeedback@nvidia.com
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <helper_functions.h>
+#include <rendercheck_d3d10.h>
+
+// Saves the resource behind the device's first bound render target as a PPM
+// file named zFileName.
+//
+// Returns the result of ResourceToPPM(), or E_FAIL when no render target view
+// is bound or the view yields no resource.
+HRESULT CheckRenderD3D10::ActiveRenderTargetToPPM(ID3D10Device *pDevice,
+                                                  const char *zFileName) {
+  ID3D10RenderTargetView *pRTV = NULL;
+  pDevice->OMGetRenderTargets(1, &pRTV, NULL);
+
+  // Slot 0 may be empty; dereferencing a NULL view would crash.
+  if (pRTV == NULL) {
+    printf(
+        "ActiveRenderTargetToPPM: No render target is bound! Aborting...\n");
+    return E_FAIL;
+  }
+
+  ID3D10Resource *pSourceResource = NULL;
+  pRTV->GetResource(&pSourceResource);
+
+  HRESULT hr = E_FAIL;
+
+  if (pSourceResource != NULL) {
+    hr = ResourceToPPM(pDevice, pSourceResource, zFileName);
+    // GetResource() handed us a reference; give it back.
+    pSourceResource->Release();
+  }
+
+  // OMGetRenderTargets() handed us a reference to the view as well.
+  pRTV->Release();
+
+  return hr;
+}
+
+// Copies a 2D texture resource into a CPU-readable staging texture and writes
+// its contents to zFileName through sdkSavePPM4ub().
+//
+// Returns S_OK on success and E_FAIL when pResource is not a 2D texture, the
+// staging texture cannot be created, or the staging texture cannot be mapped.
+//
+// NOTE(review): each row is copied as Width * 4 bytes, i.e. a 4-byte-per-texel
+// layout is assumed; the texture format itself is not checked here.
+HRESULT CheckRenderD3D10::ResourceToPPM(ID3D10Device *pDevice,
+                                        ID3D10Resource *pResource,
+                                        const char *zFileName) {
+  D3D10_RESOURCE_DIMENSION rType;
+  pResource->GetType(&rType);
+
+  if (rType != D3D10_RESOURCE_DIMENSION_TEXTURE2D) {
+    printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n");
+    return E_FAIL;
+  }
+
+  ID3D10Texture2D *pSourceTexture = (ID3D10Texture2D *)pResource;
+  ID3D10Texture2D *pTargetTexture = NULL;
+
+  // Same description as the source, but as a staging texture the CPU can read.
+  D3D10_TEXTURE2D_DESC desc;
+  pSourceTexture->GetDesc(&desc);
+  desc.BindFlags = 0;
+  desc.CPUAccessFlags = D3D10_CPU_ACCESS_READ;
+  desc.Usage = D3D10_USAGE_STAGING;
+
+  if (FAILED(pDevice->CreateTexture2D(&desc, NULL, &pTargetTexture))) {
+    printf(
+        "SurfaceToPPM: Unable to create target Texture resoruce! Aborting... "
+        "\n");
+    return E_FAIL;
+  }
+
+  pDevice->CopyResource(pTargetTexture, pSourceTexture);
+
+  // A failed Map() leaves mappedTex2D unset, so bail out instead of reading it.
+  D3D10_MAPPED_TEXTURE2D mappedTex2D;
+
+  if (FAILED(pTargetTexture->Map(0, D3D10_MAP_READ, 0, &mappedTex2D))) {
+    printf("SurfaceToPPM: Unable to map target Texture! Aborting...\n");
+    pTargetTexture->Release();
+    return E_FAIL;
+  }
+
+  // Need to convert from dx pitch to pitch=width
+  unsigned char *pPPMData = new unsigned char[desc.Width * desc.Height * 4];
+
+  for (unsigned int iHeight = 0; iHeight < desc.Height; iHeight++) {
+    memcpy(
+        &(pPPMData[iHeight * desc.Width * 4]),
+        (unsigned char *)(mappedTex2D.pData) + iHeight * mappedTex2D.RowPitch,
+        desc.Width * 4);
+  }
+
+  pTargetTexture->Unmap(0);
+
+  // Prepends the PPM header info and bumps byte data afterwards
+  sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height);
+
+  delete[] pPPMData;
+  pTargetTexture->Release();
+
+  return S_OK;
+}
+
+// Locates ref_file via sdkFindFilePath(ref_file, exec_path) and compares
+// src_file against it with sdkComparePPM() using epsilon and threshold.
+// Returns false (after printing a diagnostic) when the reference file cannot
+// be found; otherwise returns whether the comparison reported true.
+//
+// NOTE(review): ref_file_path is never released here; confirm whether
+// sdkFindFilePath() transfers ownership of the returned string.
+bool CheckRenderD3D10::PPMvsPPM(const char *src_file, const char *ref_file,
+                                const char *exec_path, const float epsilon,
+                                const float threshold) {
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path != NULL) {
+    const bool matches =
+        (sdkComparePPM(src_file, ref_file_path, epsilon, threshold, true) ==
+         true);
+    return matches;
+  }
+
+  printf(
+      "CheckRenderD3D10::PPMvsPPM unable to find <%s> in <%s> Aborting "
+      "comparison!\n",
+      ref_file, exec_path);
+  printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+         ref_file);
+  printf("Aborting comparison!\n");
+  printf("  FAILURE!\n");
+  return false;
+}
--- a/Common/rendercheck_d3d10.h
+++ b/Common/rendercheck_d3d10.h
@ -0,0 +1,53 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#ifndef _RENDERCHECK_D3D10_H_
+#define _RENDERCHECK_D3D10_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <d3d10.h>
+
+// Static helpers that dump Direct3D 10 render output to PPM files and compare
+// PPM files against references.
+class CheckRenderD3D10 {
+ public:
+  CheckRenderD3D10() {}
+
+  // Compares src_file with the reference ref_file (looked up relative to
+  // exec_path); returns false when the reference cannot be found.
+  static bool PPMvsPPM(const char *src_file, const char *ref_file,
+                       const char *exec_path, const float epsilon,
+                       const float threshold = 0.0f);
+
+  // Writes a 2D texture resource to zFileName as a PPM file.
+  static HRESULT ResourceToPPM(ID3D10Device *pDevice, ID3D10Resource *pResource,
+                               const char *zFileName);
+
+  // Writes the resource of the device's first bound render target to
+  // zFileName as a PPM file.
+  static HRESULT ActiveRenderTargetToPPM(ID3D10Device *pDevice,
+                                         const char *zFileName);
+};
+
+#endif
--- a/Common/rendercheck_d3d11.cpp
+++ b/Common/rendercheck_d3d11.cpp
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/rendercheck_d3d11.h
+++ b/Common/rendercheck_d3d11.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
--- a/Common/rendercheck_d3d9.cpp
+++ b/Common/rendercheck_d3d9.cpp
@ -0,0 +1,167 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//  Utility funcs to wrap up saving a surface or the back buffer as a PPM file
+//  In addition, wraps up a threshold comparison of two PPMs.
+//
+//  These functions are designed to be used to implement an automated QA testing
+//  for SDK samples.
+//
+//  Author: Bryan Dudash
+//  Email: sdkfeedback@nvidia.com
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <helper_functions.h>
+#include <rendercheck_d3d9.h>
+
+// originally copied from checkrender_gl.cpp and slightly modified
+//
+// Locates ref_file via sdkFindFilePath(ref_file, exec_path) and compares
+// src_file against it with sdkComparePPM() using epsilon and threshold.
+// Returns false (after printing a diagnostic) when the reference file cannot
+// be found; otherwise returns whether the comparison reported true.
+//
+// NOTE(review): ref_file_path is never released here; confirm whether
+// sdkFindFilePath() transfers ownership of the returned string.
+bool CheckRenderD3D9::PPMvsPPM(const char *src_file, const char *ref_file,
+                               const char *exec_path, const float epsilon,
+                               const float threshold) {
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf(
+        "CheckRenderD3D9::PPMvsPPM unable to find <%s> in <%s> Aborting "
+        "comparison!\n",
+        ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILURE!\n");
+    return false;
+  }
+
+  return (sdkComparePPM(src_file, ref_file_path, epsilon, threshold, true) ==
+          true);
+}
+
+// Fetches back buffer 0 of swap chain 0 from pDevice and writes it to
+// zFileName via SurfaceToPPM(). Returns E_FAIL when the back buffer cannot be
+// obtained, otherwise whatever SurfaceToPPM() returns.
+HRESULT CheckRenderD3D9::BackbufferToPPM(IDirect3DDevice9 *pDevice,
+                                         const char *zFileName) {
+  IDirect3DSurface9 *pBackBuffer = NULL;
+  const HRESULT hrGet =
+      pDevice->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
+
+  if (FAILED(hrGet)) {
+    printf("Unable to get the back buffer.  Aborting...\n");
+    return E_FAIL;
+  }
+
+  const HRESULT hrSave = SurfaceToPPM(pDevice, pBackBuffer, zFileName);
+
+  // Drop the reference obtained from GetBackBuffer().
+  pBackBuffer->Release();
+
+  return hrSave;
+}
+
+// Copies pSurface into a system-memory surface, swaps the red and blue bytes
+// of every pixel, and writes the result to zFileName through sdkSavePPM4ub().
+//
+// Returns E_INVALIDARG when the surface format is neither D3DFMT_A8R8G8B8 nor
+// D3DFMT_X8R8G8B8, E_FAIL when any Direct3D call fails (every object created
+// up to that point is released first), and S_OK otherwise.
+HRESULT CheckRenderD3D9::SurfaceToPPM(IDirect3DDevice9 *pDevice,
+                                      IDirect3DSurface9 *pSurface,
+                                      const char *zFileName) {
+  D3DSURFACE_DESC pDesc;
+  pSurface->GetDesc(&pDesc);
+
+  // $$ For now only support common 8bit formats.  TODO: support for more
+  // complex formats via conversion?
+  if (!(pDesc.Format == D3DFMT_A8R8G8B8 || pDesc.Format == D3DFMT_X8R8G8B8)) {
+    return E_INVALIDARG;
+  }
+
+  IDirect3DTexture9 *pTargetTex = NULL;
+
+  if (FAILED(pDevice->CreateTexture(pDesc.Width, pDesc.Height, 1,
+                                    D3DUSAGE_DYNAMIC, pDesc.Format,
+                                    D3DPOOL_SYSTEMMEM, &pTargetTex, NULL))) {
+    printf("Unable to create texture for surface transfer! Aborting...\n");
+    return E_FAIL;
+  }
+
+  IDirect3DSurface9 *pTargetSurface = NULL;
+
+  if (FAILED(pTargetTex->GetSurfaceLevel(0, &pTargetSurface))) {
+    printf("Unable to get surface for surface transfer! Aborting...\n");
+    pTargetTex->Release();
+    return E_FAIL;
+  }
+
+  // This is required because we cannot lock a D3DPOOL_DEFAULT surface
+  // directly. So, we copy to our sysmem surface.
+  if (FAILED(pDevice->GetRenderTargetData(pSurface, pTargetSurface))) {
+    printf(
+        "Unable to GetRenderTargetData() for surface transfer! Aborting...\n");
+    pTargetSurface->Release();
+    pTargetTex->Release();
+    return E_FAIL;
+  }
+
+  // A failed LockRect() leaves lockedRect unusable, so stop before reading it.
+  D3DLOCKED_RECT lockedRect;
+
+  if (FAILED(pTargetSurface->LockRect(&lockedRect, NULL, 0))) {
+    printf("Unable to lock surface for surface transfer! Aborting...\n");
+    pTargetSurface->Release();
+    pTargetTex->Release();
+    return E_FAIL;
+  }
+
+  // Need to convert from dx pitch to pitch=width.
+  //
+  // Each pixel is also swizzled (R<->B) so the saved image is not shown with
+  // red and blue swapped in a viewer.
+  unsigned char *pPPMData = new unsigned char[pDesc.Width * pDesc.Height * 4];
+
+  for (unsigned int iHeight = 0; iHeight < pDesc.Height; iHeight++) {
+    for (unsigned int iWidth = 0; iWidth < pDesc.Width; iWidth++) {
+      DWORD color = *(DWORD *)((unsigned char *)(lockedRect.pBits) +
+                               iHeight * lockedRect.Pitch + iWidth * 4);
+
+      // R<->B, [7:0] <-> [23:16], swizzle
+      color = ((color & 0xFF) << 16) | (color & 0xFF00) |
+              ((color & 0xFF0000) >> 16) | (color & 0xFF000000);
+
+      memcpy(&(pPPMData[(iHeight * pDesc.Width + iWidth) * 4]),
+             (unsigned char *)&color, 4);
+    }
+  }
+
+  pTargetSurface->UnlockRect();
+
+  // Prepends the PPM header info and bumps byte data afterwards
+  sdkSavePPM4ub(zFileName, pPPMData, pDesc.Width, pDesc.Height);
+
+  delete[] pPPMData;
+  pTargetSurface->Release();
+  pTargetTex->Release();
+
+  return S_OK;
+}
--- a/Common/rendercheck_d3d9.h
+++ b/Common/rendercheck_d3d9.h
@ -0,0 +1,54 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#ifndef _RENDERCHECK_D3D9_H_
+#define _RENDERCHECK_D3D9_H_
+
+#include <assert.h>
+#include <d3d9.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Static helpers that dump Direct3D 9 surfaces to PPM files and compare PPM
+// files against references.
+class CheckRenderD3D9 {
+ public:
+  CheckRenderD3D9() {}
+
+  // Compares src_file with the reference ref_file (looked up relative to
+  // exec_path); returns false when the reference cannot be found.
+  static bool PPMvsPPM(const char *src_file, const char *ref_file,
+                       const char *exec_path, const float epsilon,
+                       const float threshold = 0.0f);
+
+  // Writes pSurface to zFileName as a PPM file. Only D3DFMT_A8R8G8B8 and
+  // D3DFMT_X8R8G8B8 surfaces are accepted.
+  static HRESULT SurfaceToPPM(IDirect3DDevice9 *pDevice,
+                              IDirect3DSurface9 *pSurface,
+                              const char *zFileName);
+
+  // Writes the device's back buffer to zFileName as a PPM file.
+  static HRESULT BackbufferToPPM(IDirect3DDevice9 *pDevice,
+                                 const char *zFileName);
+};
+
+#endif
--- a/Common/rendercheck_gl.h
+++ b/Common/rendercheck_gl.h
--- a/5
+++ b/5
@ -50,6 +50,9 @@ PROJECTS := $(filter-out $(FILTER_OUT),$(PROJECTS))
 %.ph_clobber :
 	+@$(MAKE) -C $(dir $*) clobber $(USE_DEVICE)

+# Invoke the `run` target of the makefile in the directory of the matched stem.
+%.ph_run :
+	+@$(MAKE) -C $(dir $*) run
+
 all:  $(addsuffix .ph_build,$(PROJECTS))
 	@echo "Finished building CUDA samples"

@ -62,3 +65,5 @@ tidy:
 clean: tidy $(addsuffix .ph_clean,$(PROJECTS))

 clobber: clean $(addsuffix .ph_clobber,$(PROJECTS))
+
+# Run every project: one .ph_run prerequisite per entry in PROJECTS.
+run: $(addsuffix .ph_run,$(PROJECTS))
--- a/README.md
+++ b/README.md
@ -1,11 +1,19 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.4 update 1](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrate features in CUDA Toolkit. This version supports [CUDA Toolkit 11.5](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

 This section describes the release notes for the CUDA Samples on GitHub only.

+### CUDA 11.5
+* Added `cuDLAHybridMode`. Demonstrates usage of cuDLA in hybrid mode.
+* Added `cuDLAStandaloneMode`. Demonstrates usage of cuDLA in standalone mode.
+* Added `cuDLAErrorReporting`. Demonstrates DLA error detection via CUDA.
+* Added `graphMemoryNodes`. Demonstrates memory allocations and frees within CUDA graphs using Graph APIs and Stream Capture APIs.
+* Added `graphMemoryFootprint`. Demonstrates how graph memory nodes re-use virtual addresses and physical memory.
+* All samples from CUDA toolkit are now available on [GitHub](https://github.com/nvidia/cuda-samples).
+
 ### CUDA 11.4 update 1
 * Added support for VS Code on linux platform.

@ -116,7 +124,7 @@ This is the first release of CUDA Samples on GitHub:

 ### Prerequisites

-Download and install the [CUDA Toolkit 11.4](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 11.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).

 ### Getting the CUDA Samples
@ -173,39 +181,104 @@ The samples makefiles can take advantage of certain options:
 ### Samples by OS

 #### Linux
-**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[shfl_scan](./Samples/shfl_scan)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[concurrentKernels](./Samples/concurrentKernels)** |
+**[simpleAssert_nvrtc](./Samples/simpleAssert_nvrtc)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[graphMemoryFootprint](./Samples/graphMemoryFootprint)** | **[MC_EstimatePiQ](./Samples/MC_EstimatePiQ)** |
 ---|---|---|---|
-**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** |
-**[nvJPEG](./Samples/nvJPEG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[boxFilterNPP](./Samples/boxFilterNPP)** |
-**[matrixMul](./Samples/matrixMul)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
-**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[cudaNvSci](./Samples/cudaNvSci)** |
-**[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** |
-**[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** |
-**[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** |
-**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleGL](./Samples/simpleGL)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
-**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[deviceQuery](./Samples/deviceQuery)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[matrixMulDrv](./Samples/matrixMulDrv)** |
-**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** |
-**[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[simpleIPC](./Samples/simpleIPC)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** |
-**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[bandwidthTest](./Samples/bandwidthTest)** | **[cdpQuadtree](./Samples/cdpQuadtree)** |
-**[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[reduction](./Samples/reduction)** |
-**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** |
+**[reductionMultiBlockCG](./Samples/reductionMultiBlockCG)** | **[cuDLAStandaloneMode](./Samples/cuDLAStandaloneMode)** | **[conjugateGradientPrecond](./Samples/conjugateGradientPrecond)** | **[ptxjit](./Samples/ptxjit)** |
+**[threadMigration](./Samples/threadMigration)** | **[EGLStream_CUDA_CrossGPU](./Samples/EGLStream_CUDA_CrossGPU)** | **[threadFenceReduction](./Samples/threadFenceReduction)** | **[simpleAtomicIntrinsics_nvrtc](./Samples/simpleAtomicIntrinsics_nvrtc)** |
+**[shfl_scan](./Samples/shfl_scan)** | **[clock](./Samples/clock)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[MC_EstimatePiP](./Samples/MC_EstimatePiP)** |
+**[transpose](./Samples/transpose)** | **[simpleMultiCopy](./Samples/simpleMultiCopy)** | **[cuDLAErrorReporting](./Samples/cuDLAErrorReporting)** | **[concurrentKernels](./Samples/concurrentKernels)** |
+**[streamOrderedAllocationIPC](./Samples/streamOrderedAllocationIPC)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cppIntegration](./Samples/cppIntegration)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** |
+**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[nvJPEG](./Samples/nvJPEG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[MC_EstimatePiInlineP](./Samples/MC_EstimatePiInlineP)** |
+**[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[simpleAssert](./Samples/simpleAssert)** | **[simpleTemplates](./Samples/simpleTemplates)** |
+**[cuHook](./Samples/cuHook)** | **[simpleCUDA2GL](./Samples/simpleCUDA2GL)** | **[matrixMul](./Samples/matrixMul)** | **[quasirandomGenerator_nvrtc](./Samples/quasirandomGenerator_nvrtc)** |
+**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleTextureDrv](./Samples/simpleTextureDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
+**[simpleCallback](./Samples/simpleCallback)** | **[batchCUBLAS](./Samples/batchCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleAtomicIntrinsics](./Samples/simpleAtomicIntrinsics)** |
+**[newdelete](./Samples/newdelete)** | **[bicubicTexture](./Samples/bicubicTexture)** | **[dxtc](./Samples/dxtc)** | **[cudaOpenMP](./Samples/cudaOpenMP)** |
+**[cdpBezierTessellation](./Samples/cdpBezierTessellation)** | **[randomFog](./Samples/randomFog)** | **[bilateralFilter](./Samples/bilateralFilter)** | **[conjugateGradient](./Samples/conjugateGradient)** |
+**[particles](./Samples/particles)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[Mandelbrot](./Samples/Mandelbrot)** | **[binomialOptions_nvrtc](./Samples/binomialOptions_nvrtc)** |
+**[cudaNvSci](./Samples/cudaNvSci)** | **[mergeSort](./Samples/mergeSort)** | **[HSOpticalFlow](./Samples/HSOpticalFlow)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** |
+**[convolutionTexture](./Samples/convolutionTexture)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** |
+**[eigenvalues](./Samples/eigenvalues)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[cuSolverSp_LowlevelCholesky](./Samples/cuSolverSp_LowlevelCholesky)** | **[topologyQuery](./Samples/topologyQuery)** |
+**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[volumeRender](./Samples/volumeRender)** | **[stereoDisparity](./Samples/stereoDisparity)** | **[simpleTexture](./Samples/simpleTexture)** |
+**[simpleStreams](./Samples/simpleStreams)** | **[smokeParticles](./Samples/smokeParticles)** | **[simpleMultiGPU](./Samples/simpleMultiGPU)** | **[deviceQueryDrv](./Samples/deviceQueryDrv)** |
+**[fastWalshTransform](./Samples/fastWalshTransform)** | **[quasirandomGenerator](./Samples/quasirandomGenerator)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** |
+**[conjugateGradientUM](./Samples/conjugateGradientUM)** | **[simpleVoteIntrinsics_nvrtc](./Samples/simpleVoteIntrinsics_nvrtc)** | **[simpleLayeredTexture](./Samples/simpleLayeredTexture)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** |
+**[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[matrixMulCUBLAS](./Samples/matrixMulCUBLAS)** | **[histEqualizationNPP](./Samples/histEqualizationNPP)** |
+**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[recursiveGaussian](./Samples/recursiveGaussian)** | **[imageDenoising](./Samples/imageDenoising)** | **[FunctionPointers](./Samples/FunctionPointers)** |
+**[simpleGL](./Samples/simpleGL)** | **[segmentationTreeThrust](./Samples/segmentationTreeThrust)** | **[scalarProd](./Samples/scalarProd)** | **[SobolQRNG](./Samples/SobolQRNG)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simplePitchLinearTexture](./Samples/simplePitchLinearTexture)** | **[freeImageInteropNPP](./Samples/freeImageInteropNPP)** |
+**[template](./Samples/template)** | **[dwtHaar1D](./Samples/dwtHaar1D)** | **[postProcessGL](./Samples/postProcessGL)** | **[BlackScholes](./Samples/BlackScholes)** |
+**[volumeFiltering](./Samples/volumeFiltering)** | **[simpleCUFFT_callback](./Samples/simpleCUFFT_callback)** | **[UnifiedMemoryStreams](./Samples/UnifiedMemoryStreams)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** |
+**[deviceQuery](./Samples/deviceQuery)** | **[simpleHyperQ](./Samples/simpleHyperQ)** | **[systemWideAtomics](./Samples/systemWideAtomics)** | **[cuSolverSp_LowlevelQR](./Samples/cuSolverSp_LowlevelQR)** |
+**[inlinePTX](./Samples/inlinePTX)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[EGLStream_CUDA_Interop](./Samples/EGLStream_CUDA_Interop)** | **[cuDLAHybridMode](./Samples/cuDLAHybridMode)** |
+**[asyncAPI](./Samples/asyncAPI)** | **[MC_EstimatePiInlineQ](./Samples/MC_EstimatePiInlineQ)** | **[scan](./Samples/scan)** | **[simpleCooperativeGroups](./Samples/simpleCooperativeGroups)** |
+**[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleTemplates_nvrtc](./Samples/simpleTemplates_nvrtc)** | **[simpleTexture3D](./Samples/simpleTexture3D)** | **[lineOfSight](./Samples/lineOfSight)** |
+**[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[binomialOptions](./Samples/binomialOptions)** | **[simpleAttributes](./Samples/simpleAttributes)** | **[bindlessTexture](./Samples/bindlessTexture)** |
+**[simpleCUFFT_2d_MGPU](./Samples/simpleCUFFT_2d_MGPU)** | **[simplePrintf](./Samples/simplePrintf)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[histogram](./Samples/histogram)** |
+**[matrixMulDynlinkJIT](./Samples/matrixMulDynlinkJIT)** | **[simpleP2P](./Samples/simpleP2P)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[vectorAddDrv](./Samples/vectorAddDrv)** |
+**[sortingNetworks](./Samples/sortingNetworks)** | **[alignedTypes](./Samples/alignedTypes)** | **[inlinePTX_nvrtc](./Samples/inlinePTX_nvrtc)** | **[simpleCubemapTexture](./Samples/simpleCubemapTexture)** |
+**[simpleIPC](./Samples/simpleIPC)** | **[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[radixSortThrust](./Samples/radixSortThrust)** | **[MonteCarloMultiGPU](./Samples/MonteCarloMultiGPU)** |
+**[cudaNvSciNvMedia](./Samples/cudaNvSciNvMedia)** | **[vectorAdd](./Samples/vectorAdd)** | **[cdpSimplePrint](./Samples/cdpSimplePrint)** | **[FilterBorderControlNPP](./Samples/FilterBorderControlNPP)** |
+**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[convolutionSeparable](./Samples/convolutionSeparable)** | **[nbody](./Samples/nbody)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** |
+**[simpleSeparateCompilation](./Samples/simpleSeparateCompilation)** | **[c++11_cuda](./Samples/c++11_cuda)** | **[fluidsGL](./Samples/fluidsGL)** | **[bandwidthTest](./Samples/bandwidthTest)** |
+**[clock_nvrtc](./Samples/clock_nvrtc)** | **[graphMemoryNodes](./Samples/graphMemoryNodes)** | **[cdpQuadtree](./Samples/cdpQuadtree)** | **[interval](./Samples/interval)** |
+**[boxFilter](./Samples/boxFilter)** | **[matrixMul_nvrtc](./Samples/matrixMul_nvrtc)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[cppOverload](./Samples/cppOverload)** |
+**[marchingCubes](./Samples/marchingCubes)** | **[cuSolverRf](./Samples/cuSolverRf)** | **[BlackScholes_nvrtc](./Samples/BlackScholes_nvrtc)** | **[cdpAdvancedQuicksort](./Samples/cdpAdvancedQuicksort)** |
+**[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[cdpSimpleQuicksort](./Samples/cdpSimpleQuicksort)** | **[simpleOccupancy](./Samples/simpleOccupancy)** | **[simpleSurfaceWrite](./Samples/simpleSurfaceWrite)** |
+**[simpleCUFFT_MGPU](./Samples/simpleCUFFT_MGPU)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[convolutionFFT2D](./Samples/convolutionFFT2D)** | **[reduction](./Samples/reduction)** |
+**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[SobelFilter](./Samples/SobelFilter)** | **[dct8x8](./Samples/dct8x8)** | **[fp16ScalarProduct](./Samples/fp16ScalarProduct)** |
+**[FDTD3d](./Samples/FDTD3d)** | **[oceanFFT](./Samples/oceanFFT)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[StreamPriorities](./Samples/StreamPriorities)** |
+**[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** | **[MC_SingleAsianOptionP](./Samples/MC_SingleAsianOptionP)** | **[simpleMPI](./Samples/simpleMPI)** |

 #### Windows
-**[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[shfl_scan](./Samples/shfl_scan)** | **[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[concurrentKernels](./Samples/concurrentKernels)** |
+**[simpleAssert_nvrtc](./Samples/simpleAssert_nvrtc)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[graphMemoryFootprint](./Samples/graphMemoryFootprint)** | **[MC_EstimatePiQ](./Samples/MC_EstimatePiQ)** |
 ---|---|---|---|
-**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** | **[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[nvJPEG](./Samples/nvJPEG)** |
-**[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[boxFilterNPP](./Samples/boxFilterNPP)** | **[matrixMul](./Samples/matrixMul)** |
-**[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** |
-**[cudaOpenMP](./Samples/cudaOpenMP)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** |
-**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** |
-**[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** | **[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** |
-**[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[simpleGL](./Samples/simpleGL)** | **[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** |
-**[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[deviceQuery](./Samples/deviceQuery)** |
-**[matrixMulDrv](./Samples/matrixMulDrv)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[simpleAttributes](./Samples/simpleAttributes)** |
-**[simpleD3D12](./Samples/simpleD3D12)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[simpleIPC](./Samples/simpleIPC)** |
-**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** | **[bandwidthTest](./Samples/bandwidthTest)** |
-**[cdpQuadtree](./Samples/cdpQuadtree)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** |
-**[reduction](./Samples/reduction)** | **[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** |
+**[reductionMultiBlockCG](./Samples/reductionMultiBlockCG)** | **[conjugateGradientPrecond](./Samples/conjugateGradientPrecond)** | **[ptxjit](./Samples/ptxjit)** | **[threadMigration](./Samples/threadMigration)** |
+**[threadFenceReduction](./Samples/threadFenceReduction)** | **[simpleAtomicIntrinsics_nvrtc](./Samples/simpleAtomicIntrinsics_nvrtc)** | **[shfl_scan](./Samples/shfl_scan)** | **[clock](./Samples/clock)** |
+**[binaryPartitionCG](./Samples/binaryPartitionCG)** | **[MC_EstimatePiP](./Samples/MC_EstimatePiP)** | **[transpose](./Samples/transpose)** | **[simpleMultiCopy](./Samples/simpleMultiCopy)** |
+**[concurrentKernels](./Samples/concurrentKernels)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[cppIntegration](./Samples/cppIntegration)** | **[bf16TensorCoreGemm](./Samples/bf16TensorCoreGemm)** |
+**[cuSolverSp_LinearSolver](./Samples/cuSolverSp_LinearSolver)** | **[nvJPEG](./Samples/nvJPEG)** | **[watershedSegmentationNPP](./Samples/watershedSegmentationNPP)** | **[MC_EstimatePiInlineP](./Samples/MC_EstimatePiInlineP)** |
+**[simpleD3D10](./Samples/simpleD3D10)** | **[vulkanImageCUDA](./Samples/vulkanImageCUDA)** | **[fluidsD3D9](./Samples/fluidsD3D9)** | **[boxFilterNPP](./Samples/boxFilterNPP)** |
+**[simpleAssert](./Samples/simpleAssert)** | **[simpleTemplates](./Samples/simpleTemplates)** | **[simpleCUDA2GL](./Samples/simpleCUDA2GL)** | **[matrixMul](./Samples/matrixMul)** |
+**[quasirandomGenerator_nvrtc](./Samples/quasirandomGenerator_nvrtc)** | **[nvJPEG_encoder](./Samples/nvJPEG_encoder)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[simpleTextureDrv](./Samples/simpleTextureDrv)** |
+**[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleCallback](./Samples/simpleCallback)** | **[SLID3D10Texture](./Samples/SLID3D10Texture)** | **[batchCUBLAS](./Samples/batchCUBLAS)** |
+**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleAtomicIntrinsics](./Samples/simpleAtomicIntrinsics)** | **[newdelete](./Samples/newdelete)** | **[bicubicTexture](./Samples/bicubicTexture)** |
+**[dxtc](./Samples/dxtc)** | **[cudaOpenMP](./Samples/cudaOpenMP)** | **[cdpBezierTessellation](./Samples/cdpBezierTessellation)** | **[randomFog](./Samples/randomFog)** |
+**[bilateralFilter](./Samples/bilateralFilter)** | **[conjugateGradient](./Samples/conjugateGradient)** | **[particles](./Samples/particles)** | **[NV12toBGRandResize](./Samples/NV12toBGRandResize)** |
+**[Mandelbrot](./Samples/Mandelbrot)** | **[binomialOptions_nvrtc](./Samples/binomialOptions_nvrtc)** | **[simpleD3D10RenderTarget](./Samples/simpleD3D10RenderTarget)** | **[mergeSort](./Samples/mergeSort)** |
+**[HSOpticalFlow](./Samples/HSOpticalFlow)** | **[immaTensorCoreGemm](./Samples/immaTensorCoreGemm)** | **[convolutionTexture](./Samples/convolutionTexture)** | **[simpleVulkan](./Samples/simpleVulkan)** |
+**[simpleD3D9Texture](./Samples/simpleD3D9Texture)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[jacobiCudaGraphs](./Samples/jacobiCudaGraphs)** | **[eigenvalues](./Samples/eigenvalues)** |
+**[simpleD3D10Texture](./Samples/simpleD3D10Texture)** | **[streamOrderedAllocationP2P](./Samples/streamOrderedAllocationP2P)** | **[cuSolverSp_LowlevelCholesky](./Samples/cuSolverSp_LowlevelCholesky)** | **[topologyQuery](./Samples/topologyQuery)** |
+**[dmmaTensorCoreGemm](./Samples/dmmaTensorCoreGemm)** | **[volumeRender](./Samples/volumeRender)** | **[stereoDisparity](./Samples/stereoDisparity)** | **[simpleTexture](./Samples/simpleTexture)** |
+**[simpleStreams](./Samples/simpleStreams)** | **[smokeParticles](./Samples/smokeParticles)** | **[simpleMultiGPU](./Samples/simpleMultiGPU)** | **[deviceQueryDrv](./Samples/deviceQueryDrv)** |
+**[fastWalshTransform](./Samples/fastWalshTransform)** | **[quasirandomGenerator](./Samples/quasirandomGenerator)** | **[vectorAddMMAP](./Samples/vectorAddMMAP)** | **[MersenneTwisterGP11213](./Samples/MersenneTwisterGP11213)** |
+**[conjugateGradientUM](./Samples/conjugateGradientUM)** | **[simpleVoteIntrinsics_nvrtc](./Samples/simpleVoteIntrinsics_nvrtc)** | **[simpleLayeredTexture](./Samples/simpleLayeredTexture)** | **[streamOrderedAllocation](./Samples/streamOrderedAllocation)** |
+**[cuSolverDn_LinearSolver](./Samples/cuSolverDn_LinearSolver)** | **[cudaCompressibleMemory](./Samples/cudaCompressibleMemory)** | **[matrixMulCUBLAS](./Samples/matrixMulCUBLAS)** | **[histEqualizationNPP](./Samples/histEqualizationNPP)** |
+**[simpleAWBarrier](./Samples/simpleAWBarrier)** | **[recursiveGaussian](./Samples/recursiveGaussian)** | **[imageDenoising](./Samples/imageDenoising)** | **[FunctionPointers](./Samples/FunctionPointers)** |
+**[simpleGL](./Samples/simpleGL)** | **[segmentationTreeThrust](./Samples/segmentationTreeThrust)** | **[scalarProd](./Samples/scalarProd)** | **[SobolQRNG](./Samples/SobolQRNG)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simplePitchLinearTexture](./Samples/simplePitchLinearTexture)** | **[freeImageInteropNPP](./Samples/freeImageInteropNPP)** |
+**[template](./Samples/template)** | **[dwtHaar1D](./Samples/dwtHaar1D)** | **[simpleD3D11Texture](./Samples/simpleD3D11Texture)** | **[postProcessGL](./Samples/postProcessGL)** |
+**[BlackScholes](./Samples/BlackScholes)** | **[simpleD3D11](./Samples/simpleD3D11)** | **[volumeFiltering](./Samples/volumeFiltering)** | **[UnifiedMemoryStreams](./Samples/UnifiedMemoryStreams)** |
+**[globalToShmemAsyncCopy](./Samples/globalToShmemAsyncCopy)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleHyperQ](./Samples/simpleHyperQ)** | **[cuSolverSp_LowlevelQR](./Samples/cuSolverSp_LowlevelQR)** |
+**[inlinePTX](./Samples/inlinePTX)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[asyncAPI](./Samples/asyncAPI)** | **[MC_EstimatePiInlineQ](./Samples/MC_EstimatePiInlineQ)** |
+**[scan](./Samples/scan)** | **[simpleCooperativeGroups](./Samples/simpleCooperativeGroups)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[simpleTemplates_nvrtc](./Samples/simpleTemplates_nvrtc)** |
+**[simpleTexture3D](./Samples/simpleTexture3D)** | **[lineOfSight](./Samples/lineOfSight)** | **[simpleZeroCopy](./Samples/simpleZeroCopy)** | **[binomialOptions](./Samples/binomialOptions)** |
+**[simpleAttributes](./Samples/simpleAttributes)** | **[bindlessTexture](./Samples/bindlessTexture)** | **[simpleD3D12](./Samples/simpleD3D12)** | **[simpleCUFFT_2d_MGPU](./Samples/simpleCUFFT_2d_MGPU)** |
+**[simplePrintf](./Samples/simplePrintf)** | **[batchedLabelMarkersAndLabelCompressionNPP](./Samples/batchedLabelMarkersAndLabelCompressionNPP)** | **[histogram](./Samples/histogram)** | **[matrixMulDynlinkJIT](./Samples/matrixMulDynlinkJIT)** |
+**[simpleP2P](./Samples/simpleP2P)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[vectorAddDrv](./Samples/vectorAddDrv)** | **[sortingNetworks](./Samples/sortingNetworks)** |
+**[alignedTypes](./Samples/alignedTypes)** | **[inlinePTX_nvrtc](./Samples/inlinePTX_nvrtc)** | **[simpleCubemapTexture](./Samples/simpleCubemapTexture)** | **[simpleIPC](./Samples/simpleIPC)** |
+**[simpleVulkanMMAP](./Samples/simpleVulkanMMAP)** | **[radixSortThrust](./Samples/radixSortThrust)** | **[MonteCarloMultiGPU](./Samples/MonteCarloMultiGPU)** | **[vectorAdd](./Samples/vectorAdd)** |
+**[VFlockingD3D10](./Samples/VFlockingD3D10)** | **[simpleD3D9](./Samples/simpleD3D9)** | **[cdpSimplePrint](./Samples/cdpSimplePrint)** | **[FilterBorderControlNPP](./Samples/FilterBorderControlNPP)** |
+**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[convolutionSeparable](./Samples/convolutionSeparable)** | **[nbody](./Samples/nbody)** | **[simpleCUBLAS_LU](./Samples/simpleCUBLAS_LU)** |
+**[simpleSeparateCompilation](./Samples/simpleSeparateCompilation)** | **[c++11_cuda](./Samples/c++11_cuda)** | **[fluidsGL](./Samples/fluidsGL)** | **[bandwidthTest](./Samples/bandwidthTest)** |
+**[clock_nvrtc](./Samples/clock_nvrtc)** | **[graphMemoryNodes](./Samples/graphMemoryNodes)** | **[cdpQuadtree](./Samples/cdpQuadtree)** | **[interval](./Samples/interval)** |
+**[boxFilter](./Samples/boxFilter)** | **[matrixMul_nvrtc](./Samples/matrixMul_nvrtc)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[cppOverload](./Samples/cppOverload)** |
+**[marchingCubes](./Samples/marchingCubes)** | **[cuSolverRf](./Samples/cuSolverRf)** | **[BlackScholes_nvrtc](./Samples/BlackScholes_nvrtc)** | **[cdpAdvancedQuicksort](./Samples/cdpAdvancedQuicksort)** |
+**[tf32TensorCoreGemm](./Samples/tf32TensorCoreGemm)** | **[cdpSimpleQuicksort](./Samples/cdpSimpleQuicksort)** | **[simpleOccupancy](./Samples/simpleOccupancy)** | **[simpleSurfaceWrite](./Samples/simpleSurfaceWrite)** |
+**[simpleCUFFT_MGPU](./Samples/simpleCUFFT_MGPU)** | **[simpleDrvRuntime](./Samples/simpleDrvRuntime)** | **[convolutionFFT2D](./Samples/convolutionFFT2D)** | **[reduction](./Samples/reduction)** |
+**[memMapIPCDrv](./Samples/memMapIPCDrv)** | **[SobelFilter](./Samples/SobelFilter)** | **[dct8x8](./Samples/dct8x8)** | **[fp16ScalarProduct](./Samples/fp16ScalarProduct)** |
+**[FDTD3d](./Samples/FDTD3d)** | **[oceanFFT](./Samples/oceanFFT)** | **[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[cannyEdgeDetectorNPP](./Samples/cannyEdgeDetectorNPP)** |
+**[MC_SingleAsianOptionP](./Samples/MC_SingleAsianOptionP)** | **[simpleMPI](./Samples/simpleMPI)** |

 ## Dependencies

@ -374,5 +447,5 @@ Answers to frequently asked questions about CUDA can be found at http://develope
 ## References

 *   [CUDA Programming Guide](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)
-*   [Accelerated Computing Blog](https://devblogs.nvidia.com/category/accelerated-computing/)
+*   [Accelerated Computing Blog](https://developer.nvidia.com/blog/?tags=accelerated-computing)

--- a/Samples/BlackScholes/.vscode/c_cpp_properties.json
+++ b/Samples/BlackScholes/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/../../Common"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++14",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
--- a/Samples/BlackScholes/.vscode/extensions.json
+++ b/Samples/BlackScholes/.vscode/extensions.json
@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
--- a/Samples/BlackScholes/.vscode/launch.json
+++ b/Samples/BlackScholes/.vscode/launch.json
@ -0,0 +1,10 @@
+{
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/BlackScholes"
+        }
+    ]
+}
--- a/Samples/BlackScholes/.vscode/tasks.json
+++ b/Samples/BlackScholes/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/BlackScholes/BlackScholes.cu
+++ b/Samples/BlackScholes/BlackScholes.cu
@ -0,0 +1,243 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample evaluates fair call and put prices for a
+ * given set of European options by Black-Scholes formula.
+ * See supplied whitepaper for more explanations.
+ */
+
+#include <helper_functions.h>  // helper functions for string parsing
+#include <helper_cuda.h>  // helper functions for CUDA error checking and initialization
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options on CPU
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult,
+                                float *h_StockPrice, float *h_OptionStrike,
+                                float *h_OptionYears, float Riskfree,
+                                float Volatility, int optN);
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of OptN options on GPU
+////////////////////////////////////////////////////////////////////////////////
+#include "BlackScholes_kernel.cuh"
+
+////////////////////////////////////////////////////////////////////////////////
+// Draws one value from rand() and maps it linearly onto the closed interval
+// [low, high]: a draw of 0 yields low, a draw of RAND_MAX yields high.
+// The sequence is therefore controlled by whatever seed was given to srand().
+////////////////////////////////////////////////////////////////////////////////
+float RandFloat(float low, float high) {
+  // Normalised position of the sample inside [0, 1].
+  const float fraction =
+      static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+
+  // Linear interpolation between the two bounds.
+  return (1.0f - fraction) * low + fraction * high;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Data configuration
+////////////////////////////////////////////////////////////////////////////////
+// Number of options in every input/output array.
+const int OPT_N = 4000000;
+// Number of back-to-back kernel launches the GPU timing is averaged over.
+const int NUM_ITERATIONS = 512;
+
+// Size in bytes of one array of OPT_N floats (used for malloc/cudaMalloc/
+// cudaMemcpy).
+const int OPT_SZ = OPT_N * sizeof(float);
+// Risk-free rate and volatility shared by all options; the same values are
+// passed to both the CPU and the GPU implementation.
+const float RISKFREE = 0.02f;
+const float VOLATILITY = 0.30f;
+
+// Integer division of a by b, rounded up (for non-negative a and positive b).
+#define DIV_UP(a, b) (((a) + (b)-1) / (b))
+
+////////////////////////////////////////////////////////////////////////////////
+// Main program
+//
+// Generates OPT_N random options on the host, prices them NUM_ITERATIONS
+// times on the GPU (timed), prices them once on the CPU and compares the
+// call prices of both runs.
+//
+// Exit status: EXIT_FAILURE when host memory cannot be allocated or when the
+// relative L1 error of the call prices exceeds 1e-6, EXIT_SUCCESS otherwise.
+// CUDA API failures are handled by checkCudaErrors()/getLastCudaError().
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  // Threads per block for the kernel launch. Kept in one place so the launch
+  // configuration and the summary line below cannot drift apart; it must not
+  // exceed the __launch_bounds__ value declared on BlackScholesGPU.
+  const int THREADS_PER_BLOCK = 128;
+
+  // Start logs
+  printf("[%s] - Starting...\n", argv[0]);
+
+  //'h_' prefix - CPU (host) memory space
+  float *h_CallResultCPU, *h_PutResultCPU;  // results calculated by CPU
+  float *h_CallResultGPU, *h_PutResultGPU;  // CPU copy of GPU results
+  float *h_StockPrice, *h_OptionStrike, *h_OptionYears;  // CPU input data
+
+  //'d_' prefix - GPU (device) memory space
+  float *d_CallResult, *d_PutResult;  // results calculated by GPU
+  float *d_StockPrice, *d_OptionStrike, *d_OptionYears;  // GPU input data
+
+  double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
+
+  StopWatchInterface *hTimer = NULL;
+  int i;
+
+  findCudaDevice(argc, (const char **)argv);
+
+  sdkCreateTimer(&hTimer);
+
+  printf("Initializing data...\n");
+  printf("...allocating CPU memory for options.\n");
+  h_CallResultCPU = (float *)malloc(OPT_SZ);
+  h_PutResultCPU = (float *)malloc(OPT_SZ);
+  h_CallResultGPU = (float *)malloc(OPT_SZ);
+  h_PutResultGPU = (float *)malloc(OPT_SZ);
+  h_StockPrice = (float *)malloc(OPT_SZ);
+  h_OptionStrike = (float *)malloc(OPT_SZ);
+  h_OptionYears = (float *)malloc(OPT_SZ);
+
+  // The buffers are written immediately below, so a failed allocation must
+  // be reported here instead of crashing inside the initialisation loop.
+  if (h_CallResultCPU == NULL || h_PutResultCPU == NULL ||
+      h_CallResultGPU == NULL || h_PutResultGPU == NULL ||
+      h_StockPrice == NULL || h_OptionStrike == NULL ||
+      h_OptionYears == NULL) {
+    fprintf(stderr, "Failed to allocate CPU memory for options.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  printf("...allocating GPU memory for options.\n");
+  checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ));
+  checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ));
+  checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ));
+  checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ));
+  checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ));
+
+  printf("...generating input data in CPU mem.\n");
+  // Fixed seed: every run prices the same set of options.
+  srand(5347);
+
+  // Generate options set
+  for (i = 0; i < OPT_N; i++) {
+    h_CallResultCPU[i] = 0.0f;
+    h_PutResultCPU[i] = -1.0f;
+    h_StockPrice[i] = RandFloat(5.0f, 30.0f);
+    h_OptionStrike[i] = RandFloat(1.0f, 100.0f);
+    h_OptionYears[i] = RandFloat(0.25f, 10.0f);
+  }
+
+  printf("...copying input data to GPU mem.\n");
+  // Copy options data to GPU memory for further processing
+  checkCudaErrors(
+      cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ,
+                             cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice));
+  printf("Data init done.\n\n");
+
+  printf("Executing Black-Scholes GPU kernel (%i iterations)...\n",
+         NUM_ITERATIONS);
+  // Drain pending device work so the timer only covers the kernel launches.
+  checkCudaErrors(cudaDeviceSynchronize());
+  sdkResetTimer(&hTimer);
+  sdkStartTimer(&hTimer);
+
+  // Each thread prices two options (one float2), hence OPT_N / 2 threads.
+  for (i = 0; i < NUM_ITERATIONS; i++) {
+    BlackScholesGPU<<<DIV_UP((OPT_N / 2), THREADS_PER_BLOCK),
+                      THREADS_PER_BLOCK>>>(
+        (float2 *)d_CallResult, (float2 *)d_PutResult, (float2 *)d_StockPrice,
+        (float2 *)d_OptionStrike, (float2 *)d_OptionYears, RISKFREE, VOLATILITY,
+        OPT_N);
+    getLastCudaError("BlackScholesGPU() execution failed\n");
+  }
+
+  checkCudaErrors(cudaDeviceSynchronize());
+  sdkStopTimer(&hTimer);
+  // Average time of a single kernel launch.
+  gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
+
+  // Both call and put is calculated
+  printf("Options count             : %i     \n", 2 * OPT_N);
+  printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
+  printf("Effective memory bandwidth: %f GB/s\n",
+         ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
+  printf("Gigaoptions per second    : %f     \n\n",
+         ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
+
+  // Arguments are converted to unsigned so they match the %u conversions.
+  printf(
+      "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u "
+      "options, NumDevsUsed = %u, Workgroup = %u\n",
+      (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3,
+      (unsigned int)(2 * OPT_N), 1u, (unsigned int)THREADS_PER_BLOCK);
+
+  printf("\nReading back GPU results...\n");
+  // Read back GPU results to compare them to CPU results
+  checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ,
+                             cudaMemcpyDeviceToHost));
+  checkCudaErrors(
+      cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost));
+
+  printf("Checking the results...\n");
+  printf("...running CPU calculations.\n\n");
+  // Calculate options values on CPU
+  BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
+                  h_OptionYears, RISKFREE, VOLATILITY, OPT_N);
+
+  printf("Comparing the results...\n");
+  // Calculate max absolute difference and L1 distance
+  // between CPU and GPU results.
+  // Only the call prices enter the comparison; the put prices are copied back
+  // above but are not validated here.
+  sum_delta = 0;
+  sum_ref = 0;
+  max_delta = 0;
+
+  for (i = 0; i < OPT_N; i++) {
+    ref = h_CallResultCPU[i];
+    delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
+
+    if (delta > max_delta) {
+      max_delta = delta;
+    }
+
+    sum_delta += delta;
+    sum_ref += fabs(ref);
+  }
+
+  // Relative L1 error: sum of absolute differences over sum of references.
+  L1norm = sum_delta / sum_ref;
+  printf("L1 norm: %E\n", L1norm);
+  printf("Max absolute error: %E\n\n", max_delta);
+
+  printf("Shutting down...\n");
+  printf("...releasing GPU memory.\n");
+  checkCudaErrors(cudaFree(d_OptionYears));
+  checkCudaErrors(cudaFree(d_OptionStrike));
+  checkCudaErrors(cudaFree(d_StockPrice));
+  checkCudaErrors(cudaFree(d_PutResult));
+  checkCudaErrors(cudaFree(d_CallResult));
+
+  printf("...releasing CPU memory.\n");
+  free(h_OptionYears);
+  free(h_OptionStrike);
+  free(h_StockPrice);
+  free(h_PutResultGPU);
+  free(h_CallResultGPU);
+  free(h_PutResultCPU);
+  free(h_CallResultCPU);
+  sdkDeleteTimer(&hTimer);
+  printf("Shutdown done.\n");
+
+  printf("\n[BlackScholes] - Test Summary\n");
+
+  if (L1norm > 1e-6) {
+    printf("Test failed!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n\n");
+  printf("Test passed\n");
+  exit(EXIT_SUCCESS);
+}
--- a/Samples/BlackScholes/BlackScholes_gold.cpp
+++ b/Samples/BlackScholes/BlackScholes_gold.cpp
@ -0,0 +1,86 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Cumulative normal distribution function, approximated by a degree-5
+// polynomial in k = 1 / (1 + 0.2316419 * |d|) scaled by the normal density.
+// The polynomial gives the tail value; for d > 0 the result is mirrored
+// (1 - tail).
+////////////////////////////////////////////////////////////////////////////////
+static double CND(double d) {
+  // Polynomial coefficients A1..A5, lowest order first.
+  static const double kCoeff[5] = {0.31938153, -0.356563782, 1.781477937,
+                                   -1.821255978, 1.330274429};
+  // 1 / sqrt(2 * pi)
+  const double kInvSqrt2Pi = 0.39894228040143267793994605993438;
+
+  const double k = 1.0 / (1.0 + 0.2316419 * fabs(d));
+
+  // Horner scheme for k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5)))),
+  // evaluated from the innermost term outwards.
+  double horner = kCoeff[4];
+  for (int i = 3; i >= 0; --i) {
+    horner = kCoeff[i] + k * horner;
+  }
+  const double poly = k * horner;
+
+  const double tail = kInvSqrt2Pi * exp(-0.5 * d * d) * poly;
+
+  return (d > 0) ? 1.0 - tail : tail;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Prices one European option with the Black-Scholes formula.
+// Inputs are single precision, all arithmetic is done in double precision
+// and the call and put prices are written back as float through the two
+// reference parameters.
+////////////////////////////////////////////////////////////////////////////////
+static void BlackScholesBodyCPU(float &callResult, float &putResult,
+                                float Sf,  // Stock price
+                                float Xf,  // Option strike
+                                float Tf,  // Option years
+                                float Rf,  // Riskless rate
+                                float Vf  // Volatility rate
+                                ) {
+  // Promote every input to double before doing any arithmetic.
+  const double stock = Sf;
+  const double strike = Xf;
+  const double years = Tf;
+  const double rate = Rf;
+  const double vol = Vf;
+
+  const double rootT = sqrt(years);
+  const double volRootT = vol * rootT;
+
+  const double d1 =
+      (log(stock / strike) + (rate + 0.5 * vol * vol) * years) / volRootT;
+  const double d2 = d1 - volRootT;
+
+  const double cndD1 = CND(d1);
+  const double cndD2 = CND(d2);
+
+  // Strike discounted back to today; shared by the call and the put price.
+  const double discountedStrike = strike * exp(-rate * years);
+
+  callResult = (float)(stock * cndD1 - discountedStrike * cndD2);
+  putResult =
+      (float)(discountedStrike * (1.0 - cndD2) - stock * (1.0 - cndD1));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// CPU reference implementation: prices the first optN options one by one.
+// Element i of the two result arrays receives the call and put price of the
+// option described by element i of the three input arrays; Riskfree and
+// Volatility are the same for every option.
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult,
+                                float *h_StockPrice, float *h_OptionStrike,
+                                float *h_OptionYears, float Riskfree,
+                                float Volatility, int optN) {
+  for (int i = 0; i < optN; ++i) {
+    float &call = h_CallResult[i];
+    float &put = h_PutResult[i];
+
+    BlackScholesBodyCPU(call, put, h_StockPrice[i], h_OptionStrike[i],
+                        h_OptionYears[i], Riskfree, Volatility);
+  }
+}
--- a/Samples/BlackScholes/BlackScholes_kernel.cuh
+++ b/Samples/BlackScholes/BlackScholes_kernel.cuh
@ -0,0 +1,106 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-side cumulative normal distribution function.
+// Same degree-5 polynomial approximation as the CPU reference, but in single
+// precision and built on the __fdividef / __expf intrinsics.
+////////////////////////////////////////////////////////////////////////////////
+__device__ inline float cndGPU(float d) {
+  // Polynomial coefficients.
+  const float c1 = 0.31938153f;
+  const float c2 = -0.356563782f;
+  const float c3 = 1.781477937f;
+  const float c4 = -1.821255978f;
+  const float c5 = 1.330274429f;
+  // 1 / sqrt(2 * pi)
+  const float invSqrt2Pi = 0.39894228040143267793994605993438f;
+
+  const float k = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
+
+  // Polynomial in nested (Horner) form.
+  const float poly = k * (c1 + k * (c2 + k * (c3 + k * (c4 + k * c5))));
+
+  const float tail = invSqrt2Pi * __expf(-0.5f * d * d) * poly;
+
+  // The polynomial gives the tail value; mirror it for positive d.
+  return (d > 0) ? 1.0f - tail : tail;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Prices one European option on the device with the Black-Scholes formula.
+// Works entirely in single precision with intrinsic math functions and
+// writes the call and put prices through the two reference parameters.
+////////////////////////////////////////////////////////////////////////////////
+__device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult,
+                                           float S,  // Stock price
+                                           float X,  // Option strike
+                                           float T,  // Option years
+                                           float R,  // Riskless rate
+                                           float V  // Volatility rate
+                                           ) {
+  // sqrt(T), obtained as the reciprocal of rsqrtf(T).
+  const float rootT = __fdividef(1.0F, rsqrtf(T));
+
+  const float d1 =
+      __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * rootT);
+  const float d2 = d1 - V * rootT;
+
+  const float cndD1 = cndGPU(d1);
+  const float cndD2 = cndGPU(d2);
+
+  // Discount factor shared by the call and the put price.
+  const float discount = __expf(-R * T);
+
+  CallResult = S * cndD1 - X * discount * cndD2;
+  PutResult = X * discount * (1.0f - cndD2) - S * (1.0f - cndD1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Kernel: prices optN options, two per thread.
+// Every array is accessed as float2, so thread i handles options 2*i and
+// 2*i + 1 (the .x and .y components of element i). Threads whose index is
+// not below optN / 2 do nothing; because of the integer division an odd
+// optN leaves the last option untouched.
+// Declared with __launch_bounds__(128).
+////////////////////////////////////////////////////////////////////////////////
+__launch_bounds__(128) __global__
+    void BlackScholesGPU(float2 *__restrict d_CallResult,
+                         float2 *__restrict d_PutResult,
+                         float2 *__restrict d_StockPrice,
+                         float2 *__restrict d_OptionStrike,
+                         float2 *__restrict d_OptionYears, float Riskfree,
+                         float Volatility, int optN) {
+  // Index of the float2 element (pair of options) owned by this thread.
+  const int pair = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // Threads past the end of the data have nothing to do.
+  if (pair >= (optN / 2)) {
+    return;
+  }
+
+  // Calculating 2 options per thread to increase ILP (instruction level
+  // parallelism)
+  const float2 stock = d_StockPrice[pair];
+  const float2 strike = d_OptionStrike[pair];
+  const float2 years = d_OptionYears[pair];
+
+  float2 call, put;
+  BlackScholesBodyGPU(call.x, put.x, stock.x, strike.x, years.x, Riskfree,
+                      Volatility);
+  BlackScholesBodyGPU(call.y, put.y, stock.y, strike.y, years.y, Riskfree,
+                      Volatility);
+
+  d_CallResult[pair] = make_float2(call.x, call.y);
+  d_PutResult[pair] = make_float2(put.x, put.y);
+}
--- a/Samples/BlackScholes/BlackScholes_vs2017.sln
+++ b/Samples/BlackScholes/BlackScholes_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BlackScholes", "BlackScholes_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/BlackScholes/BlackScholes_vs2017.vcxproj
+++ b/Samples/BlackScholes/BlackScholes_vs2017.vcxproj
@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>BlackScholes_vs2017</RootNamespace>
+    <ProjectName>BlackScholes</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/BlackScholes.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
+      <!-- AdditionalOptions must be defined once: a second definition that does
+           not reference %(AdditionalOptions) replaces the first one, which
+           would drop the /wd 4819 host-compiler flag. -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="BlackScholes.cu" />
+    <ClCompile Include="BlackScholes_gold.cpp" />
+    <None Include="BlackScholes_kernel.cuh" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/BlackScholes/BlackScholes_vs2019.sln
+++ b/Samples/BlackScholes/BlackScholes_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BlackScholes", "BlackScholes_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/BlackScholes/BlackScholes_vs2019.vcxproj
+++ b/Samples/BlackScholes/BlackScholes_vs2019.vcxproj
@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>BlackScholes_vs2019</RootNamespace>
+    <ProjectName>BlackScholes</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/BlackScholes.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
+      <!-- AdditionalOptions must be defined once: a second definition that does
+           not reference %(AdditionalOptions) replaces the first one, which
+           would drop the /wd 4819 host-compiler flag. -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="BlackScholes.cu" />
+    <ClCompile Include="BlackScholes_gold.cpp" />
+    <None Include="BlackScholes_kernel.cuh" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/BlackScholes/Makefile
+++ b/Samples/BlackScholes/Makefile
@ -0,0 +1,365 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project (only supported on Mac OS X and Linux platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+# Legacy switches (x86_64=1, ARMv7=1, aarch64=1, ppc64le=1, GCC=<compiler>) are
+# still accepted. Each one prints a deprecation notice and seeds the modern
+# variable with ?=, so a TARGET_ARCH / HOST_COMPILER value that is already set
+# takes precedence over the legacy switch.
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+# Unlike the switches above, abi has no replacement: any non-empty value aborts
+# parsing with an error.
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+# HOST_ARCH is whatever `uname -m` reports; TARGET_ARCH defaults to it.
+# TARGET_SIZE (used later for nvcc's -m flag) is fixed per architecture for
+# cross builds and taken from `getconf LONG_BIT` for native builds.
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+# The test is whether $(CUDA_PATH)/targets/sbsa-linux exists (ls echoes the path
+# back only when it does); if so both HOST_ARCH and TARGET_ARCH become sbsa.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+# Cross compilation is only accepted for the host-target pairs listed below.
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+# HOST_OS is the lower-cased output of `uname -s`; TARGET_OS defaults to it and
+# must be one of linux, darwin, qnx or android.
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+# Every assignment below uses ?=, so a HOST_COMPILER that is already set is
+# never replaced; the branches only pick a default.
+ifeq ($(TARGET_OS),darwin)
+    # Use clang++ when the major version reported by xcodebuild is 5 or newer
+    # ($$2 is awk's second field; the doubled $ keeps make from expanding it).
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    # Cross builds: choose the cross toolchain by target architecture and OS.
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            # QNX builds need both QNX_HOST and QNX_TARGET; they are exported so
+            # that commands run from recipes see them in their environment.
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+# Fallback when no branch above selected a compiler.
+HOST_COMPILER ?= g++
+# Every compile and link step goes through nvcc, with the chosen host compiler
+# passed via -ccbin.
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+# NVCCFLAGS go straight to nvcc. CCFLAGS and LDFLAGS are host compiler and
+# linker flags that are later wrapped word by word in -Xcompiler / -Xlinker.
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+# Cross-compilation extras. TARGET_FS, when set, is used as the sysroot and as
+# the root of the library/include search paths added below.
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            # NOTE(review): expr is handed dotted version strings here, which are
+            # not integers, so this is presumably a string comparison (e.g.
+            # "10.2.0" would sort before "4.6") - confirm this is acceptable.
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            # Same version test as in the armv7l branch above (same caveat).
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        # The rpath-link options travel through CCFLAGS with escaped commas -
+        # presumably so nvcc's -Xcompiler does not split the value at them;
+        # TODO confirm.
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+# Maps the TARGET_ARCH-TARGET_OS pair to a targets/<name>/ sub-directory; the
+# value stays empty for combinations not listed (e.g. native x86_64).
+# Nothing else in this Makefile references CUDA_INSTALL_TARGET_DIR.
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+# dbg=1 adds -g -G to the nvcc flags; BUILD_TYPE names the output directory
+# under ../../bin used by the link and clean rules.
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+# Final compile flags: nvcc flags as-is, plus every host-compiler flag word
+# prefixed with -Xcompiler. The EXTRA_* variables are not set in this file, so
+# they act as hooks for flags supplied from outside.
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+# Set to 0 further down when a prerequisite check fails; recipes then only echo.
+SAMPLE_ENABLED := 1
+
+# Final link flags: a snapshot of ALL_CCFLAGS as it stands here (simple
+# expansion), plus every linker flag word prefixed with -Xlinker.
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Detect whether the host compiler is new enough for the required C++11 support
+# (Linux targets only). The dotted version printed by `-dumpversion` is squeezed
+# into a single number and compared against 51000, i.e. 5.1.0.
+ifeq ($(TARGET_OS),linux)
+    # $(space) holds exactly one space; it is needed as the first argument of
+    # $(subst) below.
+    empty :=
+    space := $(empty) $(empty)
+    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
+# Build the version number without "." by appending the first three dot-separated fields
+# NOTE(review): if -dumpversion prints no dots (major number only), cut is
+# expected to echo the whole line for every field - confirm the resulting
+# number still compares as intended.
+    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
+# Append "00" so the joined number has enough digits to compare against 51000
+    GCCVERSION += 00
+# Remove spaces from the version number
+    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
+#$(warning $(GCCVERSION))
+
+    # expr prints 1 when the comparison holds and 0 otherwise.
+    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 51000)
+
+    ifeq ($(IS_MIN_VERSION), 1)
+        $(info >>> GCC Version is greater or equal to 5.1.0 <<<)
+    else
+        $(info >>> Waiving build. Minimum GCC version required is 5.1.0<<<)
+        SAMPLE_ENABLED := 0
+    endif
+endif
+
+# Gencode arguments
+# Default SM list; armv7l and aarch64 targets additionally get SM 72. An SMS
+# value that is already set is kept (?=).
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
+SMS ?= 35 37 50 52 60 61 70 72 75 80 86
+else
+SMS ?= 35 37 50 52 60 61 70 75 80 86
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+# GENCODE_FLAGS is only derived from SMS when it has not been provided already.
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+# NOTE(review): $(sort) orders words lexically, so "highest" is only numerically
+# correct while all SM values have the same number of digits.
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+# Sample-specific compile flags. ALL_LDFLAGS copied ALL_CCFLAGS earlier using
+# simple expansion, so these additions apply to the compile steps only.
+ALL_CCFLAGS += -maxrregcount=16 --threads 0 --std=c++11
+
+# When the sample is waived, EXEC turns every recipe command into an echo of
+# that command instead of running it.
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+#
+# These names are commands, not files produced by the build. Declaring them
+# phony keeps a stray file or directory called e.g. "clean", "run" or "build"
+# from making the target look up to date and silently skipping its recipe.
+.PHONY: all build check.deps run clean clobber
+
+all: build
+
+build: BlackScholes
+
+# Report whether the prerequisite checks above left the sample enabled.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Both translation units are compiled through nvcc so that the .cu and .cpp
+# objects receive the same flags.
+BlackScholes.o:BlackScholes.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+BlackScholes_gold.o:BlackScholes_gold.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Link the executable, then copy it into the shared bin/<arch>/<os>/<type> tree.
+BlackScholes: BlackScholes.o BlackScholes_gold.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./BlackScholes
+
+# Remove only what this Makefile created: the local objects and executable and
+# the copy placed in the shared bin tree.
+clean:
+	rm -f BlackScholes BlackScholes.o BlackScholes_gold.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/BlackScholes
+
+clobber: clean
--- a/Samples/BlackScholes/NsightEclipse.xml
+++ b/Samples/BlackScholes/NsightEclipse.xml
@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>BlackScholes</name>
+  <cflags>
+    <flag>-maxrregcount=16</flag>
+  </cflags>
+  <description><![CDATA[This sample evaluates fair call and put prices for a given set of European options by Black-Scholes formula.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">Computational Finance</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>CUDA</keyword>
+    <keyword>Computational Finance</keyword>
+    <keyword>option pricing</keyword>
+    <keyword>Black-Scholes</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>BlackScholes.cu</primary_file>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>3:Computational Finance</scope>
+  </scopes>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <sm-arch>sm80</sm-arch>
+  <sm-arch>sm86</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>Black-Scholes Option Pricing</title>
+  <type>exe</type>
+  <whitepaper>doc\BlackScholes.pdf</whitepaper>
+</entry>
--- a/Samples/BlackScholes/README.md
+++ b/Samples/BlackScholes/README.md
@ -0,0 +1,67 @@
+# BlackScholes - Black-Scholes Option Pricing
+
+## Description
+
+This sample evaluates fair call and put prices for a given set of European options using the Black-Scholes formula.
+
+## Key Concepts
+
+Computational Finance
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
--- a/Samples/BlackScholes/doc/BlackScholes.doc
+++ b/Samples/BlackScholes/doc/BlackScholes.doc
--- a/Samples/BlackScholes/doc/BlackScholes.pdf
+++ b/Samples/BlackScholes/doc/BlackScholes.pdf
--- a/Samples/BlackScholes_nvrtc/.vscode/c_cpp_properties.json
+++ b/Samples/BlackScholes_nvrtc/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/../../Common"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++14",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
--- a/Samples/BlackScholes_nvrtc/.vscode/extensions.json
+++ b/Samples/BlackScholes_nvrtc/.vscode/extensions.json
@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
--- a/Samples/BlackScholes_nvrtc/.vscode/launch.json
+++ b/Samples/BlackScholes_nvrtc/.vscode/launch.json
@ -0,0 +1,10 @@
+{
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/BlackScholes_nvrtc"
+        }
+    ]
+}
--- a/Samples/BlackScholes_nvrtc/.vscode/tasks.json
+++ b/Samples/BlackScholes_nvrtc/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/BlackScholes_nvrtc/BlackScholes.cpp
+++ b/Samples/BlackScholes_nvrtc/BlackScholes.cpp
@ -0,0 +1,269 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample evaluates fair call and put prices for a
+ * given set of European options by Black-Scholes formula.
+ * See supplied whitepaper for more explanations.
+ */
+
+#include <cuda_runtime.h>
+#include <nvrtc_helper.h>
+
+#include <helper_functions.h>  // helper functions for string parsing
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options on CPU
+////////////////////////////////////////////////////////////////////////////////
+
+extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult,
+                                float *h_StockPrice, float *h_OptionStrike,
+                                float *h_OptionYears, float Riskfree,
+                                float Volatility, int optN);
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of OptN options on GPU
+// (no declaration is needed here: the BlackScholesGPU kernel is compiled at
+// run time from BlackScholes_kernel.cuh and looked up by name in main())
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// Helper function, returning uniformly distributed
+// random float in [low, high] range
+////////////////////////////////////////////////////////////////////////////////
+
+float RandFloat(float low, float high) {
+  // Scale one rand() sample by RAND_MAX to obtain a blend factor, then
+  // interpolate linearly between the two bounds with it.
+  const float fraction =
+      static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+  return (1.0f - fraction) * low + fraction * high;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Data configuration
+////////////////////////////////////////////////////////////////////////////////
+
+// Number of options in each input/output array.
+const int OPT_N = 4000000;
+// Number of back-to-back kernel launches the reported GPU time is averaged over.
+const int NUM_ITERATIONS = 512;
+// Size in bytes of one float array holding OPT_N elements.
+const int OPT_SZ = OPT_N * sizeof(float);
+// Model parameters passed unchanged to both the GPU kernel and the CPU reference.
+const float RISKFREE = 0.02f;
+const float VOLATILITY = 0.30f;
+
+// Integer division of a by b, rounded up.
+#define DIV_UP(a, b) (((a) + (b)-1) / (b))
+
+////////////////////////////////////////////////////////////////////////////////
+// Main program
+////////////////////////////////////////////////////////////////////////////////
+
+// Runs the sample end to end: allocates host and device buffers, compiles the
+// kernel source at run time, launches it NUM_ITERATIONS times, prints timing
+// figures, and validates the GPU call prices against the CPU reference.
+// Exits with EXIT_SUCCESS when the relative L1 error is at most 1e-6 and with
+// EXIT_FAILURE otherwise (or when a required resource cannot be obtained).
+int main(int argc, char **argv) {
+  // Start logs
+  printf("[%s] - Starting...\n", argv[0]);
+
+  //'h_' prefix - CPU (host) memory space
+  float
+      // Results calculated by CPU for reference
+      *h_CallResultCPU,
+      *h_PutResultCPU,
+      // CPU copy of GPU results
+      *h_CallResultGPU, *h_PutResultGPU,
+      // CPU instance of input data
+      *h_StockPrice, *h_OptionStrike, *h_OptionYears;
+
+  //'d_' prefix - GPU (device) memory space
+  CUdeviceptr
+      // Results calculated by GPU
+      d_CallResult,
+      d_PutResult,
+
+      // GPU instance of input data
+      d_StockPrice, d_OptionStrike, d_OptionYears;
+
+  double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
+
+  StopWatchInterface *hTimer = NULL;
+  int i;
+
+  sdkCreateTimer(&hTimer);
+
+  printf("Initializing data...\n");
+  printf("...allocating CPU memory for options.\n");
+
+  h_CallResultCPU = (float *)malloc(OPT_SZ);
+  h_PutResultCPU = (float *)malloc(OPT_SZ);
+  h_CallResultGPU = (float *)malloc(OPT_SZ);
+  h_PutResultGPU = (float *)malloc(OPT_SZ);
+  h_StockPrice = (float *)malloc(OPT_SZ);
+  h_OptionStrike = (float *)malloc(OPT_SZ);
+  h_OptionYears = (float *)malloc(OPT_SZ);
+
+  // Every buffer is written or read unconditionally below, so stop here with a
+  // clear message instead of dereferencing a null pointer later.
+  if (h_CallResultCPU == NULL || h_PutResultCPU == NULL ||
+      h_CallResultGPU == NULL || h_PutResultGPU == NULL ||
+      h_StockPrice == NULL || h_OptionStrike == NULL ||
+      h_OptionYears == NULL) {
+    fprintf(stderr, "Failed to allocate CPU memory for options!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  char *cubin, *kernel_file;
+  size_t cubinSize;
+  kernel_file = sdkFindFilePath("BlackScholes_kernel.cuh", argv[0]);
+
+  // NOTE(review): a null result is treated as "file not found"; confirm this
+  // matches sdkFindFilePath's failure value.
+  if (kernel_file == NULL) {
+    fprintf(stderr, "Failed to locate BlackScholes_kernel.cuh!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Compile the kernel BlackScholes_kernel.
+  compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
+  CUmodule module = loadCUBIN(cubin, argc, argv);
+
+  CUfunction kernel_addr;
+  checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "BlackScholesGPU"));
+
+  printf("...allocating GPU memory for options.\n");
+  checkCudaErrors(cuMemAlloc(&d_CallResult, OPT_SZ));
+  checkCudaErrors(cuMemAlloc(&d_PutResult, OPT_SZ));
+  checkCudaErrors(cuMemAlloc(&d_StockPrice, OPT_SZ));
+  checkCudaErrors(cuMemAlloc(&d_OptionStrike, OPT_SZ));
+  checkCudaErrors(cuMemAlloc(&d_OptionYears, OPT_SZ));
+
+  printf("...generating input data in CPU mem.\n");
+  // Fixed seed so that every run uses the same pseudo-random sequence.
+  srand(5347);
+
+  // Generate options set
+  for (i = 0; i < OPT_N; i++) {
+    h_CallResultCPU[i] = 0.0f;
+    h_PutResultCPU[i] = -1.0f;
+    h_StockPrice[i] = RandFloat(5.0f, 30.0f);
+    h_OptionStrike[i] = RandFloat(1.0f, 100.0f);
+    h_OptionYears[i] = RandFloat(0.25f, 10.0f);
+  }
+
+  printf("...copying input data to GPU mem.\n");
+  // Copy options data to GPU memory for further processing
+  checkCudaErrors(cuMemcpyHtoD(d_StockPrice, h_StockPrice, OPT_SZ));
+  checkCudaErrors(cuMemcpyHtoD(d_OptionStrike, h_OptionStrike, OPT_SZ));
+  checkCudaErrors(cuMemcpyHtoD(d_OptionYears, h_OptionYears, OPT_SZ));
+
+  printf("Data init done.\n\n");
+  printf("Executing Black-Scholes GPU kernel (%i iterations)...\n",
+         NUM_ITERATIONS);
+
+  sdkResetTimer(&hTimer);
+  sdkStartTimer(&hTimer);
+
+  // 128 threads per block; the grid is sized for OPT_N / 2 threads in total.
+  dim3 cudaBlockSize(128, 1, 1);
+  dim3 cudaGridSize(DIV_UP(OPT_N / 2, 128), 1, 1);
+
+  // cuLaunchKernel receives the address of each argument, so the scalar
+  // constants are copied into addressable locals first.
+  float risk = RISKFREE;
+  float volatility = VOLATILITY;
+  int optval = OPT_N;
+
+  void *arr[] = {(void *)&d_CallResult,  (void *)&d_PutResult,
+                 (void *)&d_StockPrice,  (void *)&d_OptionStrike,
+                 (void *)&d_OptionYears, (void *)&risk,
+                 (void *)&volatility,    (void *)&optval};
+
+  for (i = 0; i < NUM_ITERATIONS; i++) {
+    checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
+                                   cudaGridSize.z, /* grid dim */
+                                   cudaBlockSize.x, cudaBlockSize.y,
+                                   cudaBlockSize.z, /* block dim */
+                                   0, 0,            /* shared mem, stream */
+                                   &arr[0],         /* arguments */
+                                   0));
+  }
+
+  // Wait for all launches to finish before the timer is stopped.
+  checkCudaErrors(cuCtxSynchronize());
+
+  sdkStopTimer(&hTimer);
+  // Average time of a single launch.
+  gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
+
+  // Both call and put is calculated
+  printf("Options count             : %i     \n", 2 * OPT_N);
+  printf("BlackScholesGPU() time    : %f msec\n", gpuTime);
+  printf("Effective memory bandwidth: %f GB/s\n",
+         ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
+  printf("Gigaoptions per second    : %f     \n\n",
+         ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
+  printf(
+      "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u "
+      "options, NumDevsUsed = %u, Workgroup = %u\n",
+      (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime * 1e-3,
+      (2 * OPT_N), 1, 128);
+
+  printf("\nReading back GPU results...\n");
+
+  // Read back GPU results to compare them to CPU results
+  checkCudaErrors(cuMemcpyDtoH(h_CallResultGPU, d_CallResult, OPT_SZ));
+  checkCudaErrors(cuMemcpyDtoH(h_PutResultGPU, d_PutResult, OPT_SZ));
+
+  printf("Checking the results...\n");
+  printf("...running CPU calculations.\n\n");
+
+  // Calculate options values on CPU
+  BlackScholesCPU(h_CallResultCPU, h_PutResultCPU, h_StockPrice, h_OptionStrike,
+                  h_OptionYears, RISKFREE, VOLATILITY, OPT_N);
+
+  printf("Comparing the results...\n");
+  // Calculate max absolute difference and L1 distance
+  // between CPU and GPU results
+  // (only the call prices are compared; the put prices are read back above
+  // but not checked)
+  sum_delta = 0;
+  sum_ref = 0;
+  max_delta = 0;
+
+  for (i = 0; i < OPT_N; i++) {
+    ref = h_CallResultCPU[i];
+    delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
+
+    if (delta > max_delta) {
+      max_delta = delta;
+    }
+
+    sum_delta += delta;
+    sum_ref += fabs(ref);
+  }
+
+  // Relative L1 error: summed absolute differences over summed reference values.
+  L1norm = sum_delta / sum_ref;
+  printf("L1 norm: %E\n", L1norm);
+  printf("Max absolute error: %E\n\n", max_delta);
+
+  printf("Shutting down...\n");
+  printf("...releasing GPU memory.\n");
+
+  checkCudaErrors(cuMemFree(d_OptionYears));
+  checkCudaErrors(cuMemFree(d_OptionStrike));
+  checkCudaErrors(cuMemFree(d_StockPrice));
+  checkCudaErrors(cuMemFree(d_PutResult));
+  checkCudaErrors(cuMemFree(d_CallResult));
+
+  printf("...releasing CPU memory.\n");
+
+  free(h_OptionYears);
+  free(h_OptionStrike);
+  free(h_StockPrice);
+  free(h_PutResultGPU);
+  free(h_CallResultGPU);
+  free(h_PutResultCPU);
+  free(h_CallResultCPU);
+
+  sdkDeleteTimer(&hTimer);
+  printf("Shutdown done.\n");
+
+  printf("\n[%s] - Test Summary\n", argv[0]);
+
+  if (L1norm > 1e-6) {
+    printf("Test failed!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  printf("Test passed\n");
+  exit(EXIT_SUCCESS);
+}
--- a/Samples/BlackScholes_nvrtc/BlackScholes_gold.cpp
+++ b/Samples/BlackScholes_nvrtc/BlackScholes_gold.cpp
@ -0,0 +1,88 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+
+///////////////////////////////////////////////////////////////////////////////
+// Polynomial approximation of cumulative normal distribution function:
+// a 5-term polynomial in K gives the tail for |d|, reflected when d > 0
+///////////////////////////////////////////////////////////////////////////////
+static double CND(double d) {
+  const double A1 = 0.31938153;
+  const double A2 = -0.356563782;
+  const double A3 = 1.781477937;
+  const double A4 = -1.821255978;
+  const double A5 = 1.330274429;
+  const double RSQRT2PI = 0.39894228040143267793994605993438;  // 1/sqrt(2*pi)
+
+  double K = 1.0 / (1.0 + 0.2316419 * fabs(d));  // depends only on |d|
+
+  double cnd = RSQRT2PI * exp(-0.5 * d * d) *
+               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+  if (d > 0) cnd = 1.0 - cnd;  // value above is symmetric in d; mirror it
+
+  return cnd;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Black-Scholes formula for both call and put, written to the two references
+///////////////////////////////////////////////////////////////////////////////
+static void BlackScholesBodyCPU(float &callResult, float &putResult,
+                                float Sf,  // Stock price
+                                float Xf,  // Option strike
+                                float Tf,  // Option years
+                                float Rf,  // Riskless rate
+                                float Vf  // Volatility rate
+                                ) {
+  double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;  // all math below is double
+  double sqrtT = sqrt(T);
+  double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
+  double d2 = d1 - V * sqrtT;
+  double CNDD1 = CND(d1);
+  double CNDD2 = CND(d2);
+
+  // Calculate Call and Put simultaneously; expRT is shared by both results
+  double expRT = exp(-R * T);
+
+  callResult = (float)(S * CNDD1 - X * expRT * CNDD2);
+  putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// CPU reference: price optN options, one call and one put result per option
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(float *h_CallResult, float *h_PutResult,
+                                float *h_StockPrice, float *h_OptionStrike,
+                                float *h_OptionYears, float Riskfree,
+                                float Volatility, int optN) {
+  for (int i = 0; i < optN; ++i) {
+    BlackScholesBodyCPU(h_CallResult[i], h_PutResult[i], h_StockPrice[i],
+                        h_OptionStrike[i], h_OptionYears[i], Riskfree,
+                        Volatility);
+  }
+}
--- a/Samples/BlackScholes_nvrtc/BlackScholes_kernel.cuh
+++ b/Samples/BlackScholes_nvrtc/BlackScholes_kernel.cuh
@ -0,0 +1,103 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+// Polynomial approximation of cumulative normal distribution function
+// (single-precision device version built on __fdividef / __expf intrinsics)
+///////////////////////////////////////////////////////////////////////////////
+__device__ inline float cndGPU(float d) {
+  const float A1 = 0.31938153f;
+  const float A2 = -0.356563782f;
+  const float A3 = 1.781477937f;
+  const float A4 = -1.821255978f;
+  const float A5 = 1.330274429f;
+  const float RSQRT2PI = 0.39894228040143267793994605993438f;  // 1/sqrt(2*pi)
+
+  float K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));  // uses |d| only
+
+  float cnd = RSQRT2PI * __expf(-0.5f * d * d) *
+              (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+
+  if (d > 0) cnd = 1.0f - cnd;  // value above is symmetric in d; mirror it
+
+  return cnd;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Black-Scholes formula for both call and put, written to the two references
+///////////////////////////////////////////////////////////////////////////////
+__device__ inline void BlackScholesBodyGPU(float &CallResult, float &PutResult,
+                                           float S,  // Stock price
+                                           float X,  // Option strike
+                                           float T,  // Option years
+                                           float R,  // Riskless rate
+                                           float V  // Volatility rate
+                                           ) {
+  float sqrtT, expRT;
+  float d1, d2, CNDD1, CNDD2;
+
+  sqrtT = __fdividef(1.0F, rsqrtf(T));  // sqrt(T) formed as 1 / rsqrt(T)
+  d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
+  d2 = d1 - V * sqrtT;
+
+  CNDD1 = cndGPU(d1);
+  CNDD2 = cndGPU(d2);
+
+  // Calculate Call and Put simultaneously; expRT is shared by both results
+  expRT = __expf(-R * T);
+  CallResult = S * CNDD1 - X * expRT * CNDD2;
+  PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GPU kernel: every thread prices one float2 element, i.e. two options
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(128) __global__
+    void BlackScholesGPU(float2 *__restrict d_CallResult,
+                         float2 *__restrict d_PutResult,
+                         float2 *__restrict d_StockPrice,
+                         float2 *__restrict d_OptionStrike,
+                         float2 *__restrict d_OptionYears, float Riskfree,
+                         float Volatility, int optN) {
+  // Flat 1D index of the float2 element owned by this thread
+  const int pairIdx = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // Only the first optN / 2 float2 elements are processed; others exit early
+  if (pairIdx >= (optN / 2)) return;
+
+  // Two independent evaluations per thread to increase ILP (instruction level
+  // parallelism): the .x and .y lanes are priced as separate options
+  float2 call, put;
+  BlackScholesBodyGPU(call.x, put.x, d_StockPrice[pairIdx].x,
+                      d_OptionStrike[pairIdx].x, d_OptionYears[pairIdx].x,
+                      Riskfree, Volatility);
+  BlackScholesBodyGPU(call.y, put.y, d_StockPrice[pairIdx].y,
+                      d_OptionStrike[pairIdx].y, d_OptionYears[pairIdx].y,
+                      Riskfree, Volatility);
+  d_CallResult[pairIdx] = call;
+  d_PutResult[pairIdx] = put;
+}
--- a/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2017.sln
+++ b/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BlackScholes_nvrtc", "BlackScholes_nvrtc_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2017.vcxproj
+++ b/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2017.vcxproj
@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>BlackScholes_nvrtc_vs2017</RootNamespace>
+    <ProjectName>BlackScholes_nvrtc</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/BlackScholes_nvrtc.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <!-- Single AdditionalOptions element: a second definition without %(AdditionalOptions) would replace this one. -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="BlackScholes.cpp" />
+    <ClCompile Include="BlackScholes_gold.cpp" />
+    <None Include="BlackScholes_kernel.cuh" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2019.sln
+++ b/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BlackScholes_nvrtc", "BlackScholes_nvrtc_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2019.vcxproj
+++ b/Samples/BlackScholes_nvrtc/BlackScholes_nvrtc_vs2019.vcxproj
@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>BlackScholes_nvrtc_vs2019</RootNamespace>
+    <ProjectName>BlackScholes_nvrtc</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/BlackScholes_nvrtc.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <!-- Single AdditionalOptions element: a second definition without %(AdditionalOptions) would replace this one. -->
+      <AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="BlackScholes.cpp" />
+    <ClCompile Include="BlackScholes_gold.cpp" />
+    <None Include="BlackScholes_kernel.cuh" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 11.5.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/BlackScholes_nvrtc/Makefile
+++ b/Samples/BlackScholes_nvrtc/Makefile
@ -0,0 +1,422 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture: HOST_ARCH comes from uname -m, TARGET_ARCH defaults to it, TARGET_SIZE is the target word size in bits
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l)) # TARGET_ARCH is one of the supported names
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH)) # cross build: size follows from the target name
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else # native build: use the value getconf reports for LONG_BIT
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifneq (,$(wildcard $(CUDA_PATH)/targets/sbsa-linux)) # sbsa toolkit directory exists (no shell/ls fork needed)
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system: HOST_OS is the lower-cased uname -s output; TARGET_OS defaults to it
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) # stop on anything outside the supported list
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),) # a target root filesystem was supplied
+            GCCVERSIONLTEQ46 := $(shell $(HOST_COMPILER) -dumpversion | awk -F. '{ print (($$1 + 0) < 4 || (($$1 + 0) == 4 && ($$2 + 0) <= 6)) }')
+            ifeq ($(GCCVERSIONLTEQ46),1) # numeric major.minor test (expr compared e.g. "10.2.0" <= "4.6" as strings)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),) # a target root filesystem was supplied
+            GCCVERSIONLTEQ46 := $(shell $(HOST_COMPILER) -dumpversion | awk -F. '{ print (($$1 + 0) < 4 || (($$1 + 0) == 4 && ($$2 + 0) <= 6)) }')
+            ifeq ($(GCCVERSIONLTEQ46),1) # numeric major.minor test (expr compared e.g. "10.2.0" <= "4.6" as strings)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+# Optional toolkit target override: when TARGET_OVERRIDE is set, its value is
+# forwarded to nvcc through -target-dir.
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+# Maps the TARGET_ARCH/TARGET_OS pair to its targets/<name>/ sub-directory.
+# The variable stays empty for every combination not listed below
+# (for example x86_64-linux).
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    # ppc64le is matched on the architecture alone, whatever TARGET_OS is.
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Build flavour: release unless the caller passes dbg=1, in which case the
+# host compiler also emits debug information (-g).
+BUILD_TYPE := release
+ifeq ($(dbg),1)
+    BUILD_TYPE := debug
+    CCFLAGS += -g
+endif
+
+# Complete nvcc compile line: nvcc's own flags first, then every host-compiler
+# flag (built-in and EXTRA_) forwarded behind its own -Xcompiler prefix.
+ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) \
+               $(addprefix -Xcompiler ,$(CCFLAGS)) \
+               $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+# Non-empty when lsb_release reports an Ubuntu distribution. The assignment is
+# recursive (=), so the shell command only runs when UBUNTU is expanded.
+UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
+
+# The sample starts out enabled; every check below that finds an unsupported
+# platform or a missing dependency resets this to 0 and prints a notice.
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on ARMv7
+ifeq ($(TARGET_ARCH),armv7l)
+  $(info >>> WARNING - BlackScholes_nvrtc is not supported on ARMv7 - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - BlackScholes_nvrtc is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+# nvcc link line: everything from the compile line plus each linker flag
+# (built-in and EXTRA_) behind its own -Xlinker prefix.
+ALL_LDFLAGS := $(ALL_CCFLAGS) \
+               $(addprefix -Xlinker ,$(LDFLAGS)) \
+               $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Header search path for the samples' Common directory; the library list
+# starts out empty and is extended further down.
+LIBRARIES :=
+INCLUDES  := -I../../Common
+
+################################################################################
+
+# libNVRTC specific libraries
+# On macOS the link step additionally gets the toolkit lib directory and the
+# CUDA framework. ALL_LDFLAGS was already assigned with := above, so these
+# flags are only seen where the raw $(LDFLAGS) is used (the final link rule).
+ifeq ($(TARGET_OS),darwin)
+ LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA
+endif
+
+# Detect if installed version of GCC supports required C++11
+# Only evaluated for TARGET_OS=linux. The dotted version printed by
+# `$(HOST_COMPILER) -dumpversion` is flattened into one number
+# (e.g. 5.1.0 -> "5 1 0 00" -> 51000) and compared against 51000.
+ifeq ($(TARGET_OS),linux)
+    # Helper pair used to obtain a literal single space for $(subst).
+    empty :=
+    space := $(empty) $(empty)
+    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
+# Create version number without "."
+    # Fields 1-3 of the dotted string are collected as separate words.
+    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
+# Make sure the version number has at least 3 decimals
+    GCCVERSION += 00
+# Remove spaces from the version number
+    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
+#$(warning $(GCCVERSION))
+
+    # expr prints 1 when the comparison holds and 0 otherwise.
+    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 51000)
+
+    ifeq ($(IS_MIN_VERSION), 1)
+        $(info >>> GCC Version is greater or equal to 5.1.0 <<<)
+    else
+        # Too old (or version could not be determined): waive instead of failing.
+        $(info >>> Waiving build. Minimum GCC version required is 5.1.0<<<)
+        SAMPLE_ENABLED := 0
+    endif
+endif
+
+# Locate libcuda.so for the selected target and add it to the link line.
+# macOS links against the CUDA framework instead and skips the search.
+ifeq ($(TARGET_OS),darwin)
+  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
+else
+  # Candidate stub directories. Defaults use ?= so a CUDA_SEARCH_PATH supplied
+  # by the caller is kept; entries added with += are always appended.
+  ifeq ($(TARGET_ARCH),x86_64)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
+    # TARGET_OVERRIDE replaces the search path outright (:= rather than ?=).
+    ifdef TARGET_OVERRIDE
+        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
+    endif
+  endif
+
+  ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
+  endif
+
+  ifeq ($(HOST_ARCH),ppc64le)
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
+  endif
+
+  # A CUDALIB supplied by the caller is kept; otherwise search the candidates.
+  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+  ifeq ("$(CUDALIB)","")
+    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
+    SAMPLE_ENABLED := 0
+  else
+    # Keep the first hit only and reduce it to its directory. make's text
+    # functions replace the former echo | sed | sed pipeline: no extra shell
+    # is spawned and the file name is matched literally, not as a regex.
+    CUDALIB := $(patsubst %/libcuda.so,%,$(firstword $(CUDALIB)))
+    LIBRARIES += -L$(CUDALIB) -lcuda
+  endif
+endif
+
+# Flags appended last: nvcc thread option and the C++11 language level.
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+# Toolkit headers.
+INCLUDES += -I$(CUDA_PATH)/include
+
+# Runtime compilation library.
+LIBRARIES += -lnvrtc
+
+# When the sample has been waived, every recipe line that starts with $(EXEC)
+# turns into an echo of the command (prefixed with "[@]") instead of running
+# it. ?= keeps an EXEC value supplied by the caller.
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+# None of these names is a file produced by the build. Declaring them phony
+# ensures a stray file called e.g. "build", "run" or "clean" in the sample
+# directory cannot make the corresponding target look up to date.
+.PHONY: all build check.deps run clean clobber
+
+# Default goal.
+all: build
+
+# Builds the sample executable.
+build: BlackScholes_nvrtc
+
+# Reports whether the sample was waived while the makefile was parsed. The
+# ifeq is evaluated at parse time and selects which echo becomes the recipe.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Both objects are compiled from the .cpp file of the same name, directly with
+# the host compiler and the unwrapped host flags.
+BlackScholes.o BlackScholes_gold.o: %.o: %.cpp
+	$(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $<
+
+# Links the executable with the host compiler (raw $(LDFLAGS), not the
+# -Xlinker-wrapped ALL_LDFLAGS), then copies it into the shared
+# ../../bin/<arch>/<os>/<build type> directory, creating that directory first.
+# $+ lists all prerequisites in the order given.
+BlackScholes_nvrtc: BlackScholes.o BlackScholes_gold.o
+	$(EXEC) $(HOST_COMPILER) $(LDFLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+# Builds the sample if necessary and runs it from the sample directory.
+run: build
+	$(EXEC) ./BlackScholes_nvrtc
+
+# Removes the local build products and the copy placed under ../../bin for the
+# current TARGET_ARCH / TARGET_OS / BUILD_TYPE combination only.
+clean:
+	rm -f BlackScholes_nvrtc BlackScholes.o BlackScholes_gold.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/BlackScholes_nvrtc
+
+# clobber has no recipe of its own; it only triggers clean.
+clobber: clean
--- a/Samples/BlackScholes_nvrtc/README.md
+++ b/Samples/BlackScholes_nvrtc/README.md
@ -0,0 +1,71 @@
+# BlackScholes_nvrtc - Black-Scholes Option Pricing with libNVRTC
+
+## Description
+
+This sample evaluates fair call and put prices for a given set of European options by Black-Scholes formula, compiling the CUDA kernels involved at runtime using NVRTC.
+
+## Key Concepts
+
+Computational Finance, Runtime Compilation
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, aarch64
+
+## CUDA APIs involved
+
+## Dependencies needed to build/run
+[NVRTC](../../README.md#nvrtc)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
--- a/Samples/EGLStream_CUDA_CrossGPU/.vscode/c_cpp_properties.json
+++ b/Samples/EGLStream_CUDA_CrossGPU/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/../../Common"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++14",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/.vscode/extensions.json
+++ b/Samples/EGLStream_CUDA_CrossGPU/.vscode/extensions.json
@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/.vscode/launch.json
+++ b/Samples/EGLStream_CUDA_CrossGPU/.vscode/launch.json
@ -0,0 +1,10 @@
+{
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/EGLStream_CUDA_CrossGPU"
+        }
+    ]
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/.vscode/tasks.json
+++ b/Samples/EGLStream_CUDA_CrossGPU/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/Makefile
+++ b/Samples/EGLStream_CUDA_CrossGPU/Makefile
@ -0,0 +1,453 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux Platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+# Legacy switches (x86_64=1, ARMv7=1, aarch64=1, ppc64le=1, GCC=<compiler>) are
+# still honoured but print a deprecation notice. Each one only supplies a
+# default through ?=, so an explicitly set TARGET_ARCH / HOST_COMPILER wins.
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+# abi has no replacement: any non-empty value aborts with an error.
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+# HOST_ARCH comes from `uname -m`; TARGET_ARCH defaults to it. TARGET_SIZE is
+# the bit width later passed to nvcc as -m<size>: fixed per architecture when
+# cross compiling, taken from `getconf LONG_BIT` for a native build.
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    # Anything outside the supported list stops the build immediately.
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+# An aarch64 host whose toolkit contains targets/sbsa-linux is treated as sbsa
+# for both host and target. $(wildcard) performs the existence test inside
+# make, without spawning a shell for `ls`.
+ifeq ($(HOST_ARCH),aarch64)
+    ifneq ($(wildcard $(CUDA_PATH)/targets/sbsa-linux),)
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+# Cross compilation is only allowed for the host-target pairs listed here.
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+# HOST_OS is the lower-cased `uname -s`; TARGET_OS defaults to it and must be
+# one of linux, darwin, qnx or android.
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+# Every assignment below uses ?=, so a HOST_COMPILER given by the caller (or by
+# the deprecated GCC variable) is never replaced. Without a more specific
+# match the compiler falls back to g++.
+ifeq ($(TARGET_OS),darwin)
+    # clang++ is chosen when the Xcode major version reported by xcodebuild is >= 5.
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    # Cross compilation: pick the cross toolchain for the target arch/OS pair.
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            # QNX builds require both QNX_HOST and QNX_TARGET; they are exported
+            # into the environment of the commands make runs.
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            # Same QNX_HOST / QNX_TARGET requirement as in the armv7l branch.
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+# nvcc is always invoked with the selected host compiler passed through -ccbin.
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+# NVCCFLAGS is passed to nvcc as is; CCFLAGS and LDFLAGS hold host-compiler and
+# linker flags that are wrapped in -Xcompiler / -Xlinker further down.
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+# Platform-specific additions; at most one branch of the chain applies.
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+# Cross compilation only: point the host toolchain at the target root
+# filesystem (TARGET_FS) when one is given, and add the QNX-specific settings.
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            # --sysroot is passed to the compiler only when the version test
+            # yields 1; the linker always gets it.
+            # NOTE(review): 4.6 is not an integer, so expr presumably compares the
+            # two operands as strings; a version such as 10.2.0 would then also
+            # count as <= 4.6 - confirm whether that is intended.
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            # Same version test as in the armv7l branch (see the note there).
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            # Library directories inside TARGET_FS, both for linking (-L) and
+            # for resolving dependencies of shared libraries (-rpath-link).
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            # Header directories inside TARGET_FS.
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        # The rpath-link options for QNX are placed in CCFLAGS as quoted -Wl
+        # options with the commas escaped.
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+# Optional toolkit target override: when TARGET_OVERRIDE is set, its value is
+# forwarded to nvcc through -target-dir.
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+# Maps the TARGET_ARCH/TARGET_OS pair to its targets/<name>/ sub-directory.
+# The variable stays empty for every combination not listed below
+# (for example x86_64-linux).
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    # ppc64le is matched on the architecture alone, whatever TARGET_OS is.
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Build flavour: release unless the caller passes dbg=1, in which case nvcc
+# is additionally given -g -G.
+BUILD_TYPE := release
+ifeq ($(dbg),1)
+    BUILD_TYPE := debug
+    NVCCFLAGS += -g -G
+endif
+
+# Complete nvcc compile line: nvcc's own flags first, then every host-compiler
+# flag (built-in and EXTRA_) forwarded behind its own -Xcompiler prefix.
+ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) \
+               $(addprefix -Xcompiler ,$(CCFLAGS)) \
+               $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+# Non-empty when lsb_release reports an Ubuntu distribution. The assignment is
+# recursive (=), so the shell command only runs when UBUNTU is expanded.
+# NOTE(review): nothing in this makefile itself reads UBUNTU; presumably the
+# included findegl.mk does - confirm.
+UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
+
+# The sample starts out enabled; every check below that finds an unsupported
+# platform or a missing dependency resets this to 0 and prints a notice.
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on Mac OSX
+ifeq ($(TARGET_OS),darwin)
+  $(info >>> WARNING - EGLStream_CUDA_CrossGPU is not supported on Mac OSX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+# This sample is not supported on android
+ifeq ($(TARGET_OS),android)
+  $(info >>> WARNING - EGLStream_CUDA_CrossGPU is not supported on android - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+# nvcc link line: everything from the compile line plus each linker flag
+# (built-in and EXTRA_) behind its own -Xlinker prefix.
+ALL_LDFLAGS := $(ALL_CCFLAGS) \
+               $(addprefix -Xlinker ,$(LDFLAGS)) \
+               $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Header search path for the samples' Common directory; the library list
+# starts out empty and is extended further down.
+LIBRARIES :=
+INCLUDES  := -I../../Common
+
+################################################################################
+
+# Makefile include to help find EGL Libraries
+# A plain include (no leading '-'), so a missing findegl.mk is a hard error.
+include ./findegl.mk
+
+# EGL specific libraries
+# libEGL is linked on every target OS except darwin.
+ifneq ($(TARGET_OS),darwin)
+ LIBRARIES += -lEGL
+endif
+
+# Detect if installed version of GCC supports required C++11
+# Only evaluated for TARGET_OS=linux. The dotted version printed by
+# `$(HOST_COMPILER) -dumpversion` is flattened into one number
+# (e.g. 5.1.0 -> "5 1 0 00" -> 51000) and compared against 51000.
+ifeq ($(TARGET_OS),linux)
+    # Helper pair used to obtain a literal single space for $(subst).
+    empty :=
+    space := $(empty) $(empty)
+    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
+# Create version number without "."
+    # Fields 1-3 of the dotted string are collected as separate words.
+    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
+# Make sure the version number has at least 3 decimals
+    GCCVERSION += 00
+# Remove spaces from the version number
+    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
+#$(warning $(GCCVERSION))
+
+    # expr prints 1 when the comparison holds and 0 otherwise.
+    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 51000)
+
+    ifeq ($(IS_MIN_VERSION), 1)
+        $(info >>> GCC Version is greater or equal to 5.1.0 <<<)
+    else
+        # Too old (or version could not be determined): waive instead of failing.
+        $(info >>> Waiving build. Minimum GCC version required is 5.1.0<<<)
+        SAMPLE_ENABLED := 0
+    endif
+endif
+
+# Gencode arguments
+# Default SM list; armv7l and aarch64 targets additionally include 72. SMS set
+# by the caller takes precedence (?=).
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64))
+SMS ?= 35 37 50 52 60 61 70 72 75 80 86
+else
+SMS ?= 35 37 50 52 60 61 70 75 80 86
+endif
+
+# An explicitly empty SMS waives the sample.
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+# GENCODE_FLAGS is only derived from SMS when the caller did not provide it.
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+# NOTE(review): $(sort) orders words lexically, so "highest" is only the
+# numeric maximum while all SM values have the same number of digits.
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+# Locate libcuda.so for the selected target and add it to the link line.
+# macOS links against the CUDA framework instead and skips the search.
+ifeq ($(TARGET_OS),darwin)
+  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
+else
+  # Candidate stub directories. Defaults use ?= so a CUDA_SEARCH_PATH supplied
+  # by the caller is kept; entries added with += are always appended.
+  ifeq ($(TARGET_ARCH),x86_64)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
+    # TARGET_OVERRIDE replaces the search path outright (:= rather than ?=).
+    ifdef TARGET_OVERRIDE
+        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
+    endif
+  endif
+
+  ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
+  endif
+
+  ifeq ($(HOST_ARCH),ppc64le)
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
+  endif
+
+  # A CUDALIB supplied by the caller is kept; otherwise search the candidates.
+  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+  ifeq ("$(CUDALIB)","")
+    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
+    SAMPLE_ENABLED := 0
+  else
+    # Keep the first hit only and reduce it to its directory. make's text
+    # functions replace the former echo | sed | sed pipeline: no extra shell
+    # is spawned and the file name is matched literally, not as a regex.
+    CUDALIB := $(patsubst %/libcuda.so,%,$(firstword $(CUDALIB)))
+    LIBRARIES += -L$(CUDALIB) -lcuda
+  endif
+endif
+
+# Flags appended last: nvcc thread option and the C++11 language level.
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+# When the sample has been waived, every recipe line that starts with $(EXEC)
+# turns into an echo of the command (prefixed with "[@]") instead of running
+# it. ?= keeps an EXEC value supplied by the caller.
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+# None of these names is a file produced by the build. Declaring them phony
+# ensures a stray file called e.g. "build", "run" or "clean" in the sample
+# directory cannot make the corresponding target look up to date.
+.PHONY: all build check.deps run clean clobber
+
+# Default goal.
+all: build
+
+# Builds the sample executable.
+build: EGLStream_CUDA_CrossGPU
+
+# Reports whether the sample was waived while the makefile was parsed. The
+# ifeq is evaluated at parse time and selects which echo becomes the recipe.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Objects built from a .cpp file of the same name; all of them go through nvcc
+# with one shared command line.
+cuda_consumer.o cuda_producer.o eglstrm_common.o main.o: %.o: %.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# kernel.o is the only object built from a .cu source; the command is the same.
+kernel.o: kernel.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Links the executable with nvcc, then copies it into the shared
+# ../../bin/<arch>/<os>/<build type> directory, creating that directory first.
+# $+ lists all prerequisites in the order given.
+EGLStream_CUDA_CrossGPU: cuda_consumer.o cuda_producer.o eglstrm_common.o kernel.o main.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+# Builds the sample if necessary and runs it from the sample directory.
+run: build
+	$(EXEC) ./EGLStream_CUDA_CrossGPU
+
+# Removes the local build products and the copy placed under ../../bin for the
+# current TARGET_ARCH / TARGET_OS / BUILD_TYPE combination only.
+clean:
+	rm -f EGLStream_CUDA_CrossGPU cuda_consumer.o cuda_producer.o eglstrm_common.o kernel.o main.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/EGLStream_CUDA_CrossGPU
+
+# clobber has no recipe of its own; it only triggers clean.
+clobber: clean
--- a/Samples/EGLStream_CUDA_CrossGPU/NsightEclipse.xml
+++ b/Samples/EGLStream_CUDA_CrossGPU/NsightEclipse.xml
@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>EGLStream_CUDA_CrossGPU</name>
+  <cuda_api_list>
+    <driver>cuDeviceGet</driver>
+    <driver>cuDeviceGetAttribute</driver>
+    <driver>cuDeviceComputeCapability</driver>
+    <driver>cuDeviceGetCount</driver>
+    <driver>cuDeviceGetName</driver>
+    <driver>cuGraphicsResourceGetMappedEglFrame</driver>
+    <driver>cuEGLStreamConsumerAcquireFrame</driver>
+    <driver>cuEGLStreamConsumerReleaseFrame</driver>
+    <driver>cuEGLStreamProducerReturnFrame</driver>
+    <driver>cuEGLStreamProducerPresentFrame</driver>
+    <driver>cuCtxCreate</driver>
+    <driver>cuMemAlloc</driver>
+    <driver>cuMemFree</driver>
+    <driver>cuMemcpy3D</driver>
+    <driver>cuStreamCreate</driver>
+    <driver>cuCtxPushCurrent</driver>
+    <driver>cuCtxPopCurrent</driver>
+    <driver>cuCtxDestroy</driver>
+  </cuda_api_list>
+  <description><![CDATA[Demonstrates CUDA and EGL Streams interop, where consumer's EGL Stream is on one GPU and producer's on other and both consumer-producer are different processes.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">EGLStreams Interop</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>EGL Streams</keyword>
+  </keywords>
+  <libraries>
+    <library os="linux">cuda</library>
+    <library framework="true" os="macosx">CUDA</library>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>main.cpp</primary_file>
+  <required_dependencies>
+    <dependency>EGL</dependency>
+  </required_dependencies>
+  <scopes>
+    <scope>1:CUDA Advanced Topics</scope>
+    <scope>2:Graphics Interop</scope>
+  </scopes>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <sm-arch>sm80</sm-arch>
+  <sm-arch>sm86</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>EGLStream_CUDA_CrossGPU</title>
+  <type>exe</type>
+</entry>
--- a/Samples/EGLStream_CUDA_CrossGPU/README.md
+++ b/Samples/EGLStream_CUDA_CrossGPU/README.md
@ -0,0 +1,64 @@
+# EGLStream_CUDA_CrossGPU - EGLStream_CUDA_CrossGPU
+
+## Description
+
+Demonstrates CUDA and EGL Streams interop, where consumer's EGL Stream is on one GPU and producer's on other and both consumer-producer are different processes.
+
+## Key Concepts
+
+EGLStreams Interop
+
+## Supported SM Architectures
+
+[SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux
+
+## Supported CPU Architecture
+
+x86_64, armv7l
+
+## CUDA APIs involved
+
+### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
+cuDeviceGet, cuDeviceGetAttribute, cuDeviceComputeCapability, cuDeviceGetCount, cuDeviceGetName, cuGraphicsResourceGetMappedEglFrame, cuEGLStreamConsumerAcquireFrame, cuEGLStreamConsumerReleaseFrame, cuEGLStreamProducerReturnFrame, cuEGLStreamProducerPresentFrame, cuCtxCreate, cuMemAlloc, cuMemFree, cuMemcpy3D, cuStreamCreate, cuCtxPushCurrent, cuCtxPopCurrent, cuCtxDestroy
+
+## Dependencies needed to build/run
+[EGL](../../README.md#egl)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 11.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
--- a/Samples/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp
+++ b/Samples/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp
@ -0,0 +1,258 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple CUDA consumer rendering sample app
+//
+
+#include <cuda_runtime.h>
+#include "cuda_consumer.h"
+#include "eglstrm_common.h"
+#include <math.h>
+#include <unistd.h>
+
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+// Graphics resource for the frame currently held by the consumer: filled in by
+// cuEGLStreamConsumerAcquireFrame() in cudaConsumerAcquireFrame() and handed
+// back to cuEGLStreamConsumerReleaseFrame() in cudaConsumerReleaseFrame().
+CUgraphicsResource cudaResource;
+
+// Profiling bookkeeping for the acquire call (written only when profileAPI is
+// set): next write index, per-call durations, and their running sum. The index
+// wraps back to 0 after 25000 samples while the running sum keeps growing.
+static int count_acq = 0;
+static double acquire_time[25000] = {0}, total_time_acq = 0;
+
+// Same bookkeeping for the release call.
+static int count_rel = 0;
+static double rel_time[25000] = {0}, total_time_rel = 0;
+
+void acquireApiStat(void);
+
+// Computes mean, standard deviation, minimum and maximum over
+// samples[first .. count-1].
+//
+// The statistics are derived from the stored samples rather than from the
+// running totals, because the totals keep accumulating after the sample index
+// wraps around and would no longer match the samples still in the array.
+//
+// Returns false (leaving the outputs untouched) when the range is empty.
+static bool consumerSampleStats(const double *samples, int first, int count,
+                                double *avg, double *sd, double *lo,
+                                double *hi) {
+  const int used = count - first;
+  if (used < 1) return false;
+
+  double sum = 0;
+  for (int i = first; i < count; i++) sum += samples[i];
+  const double mean = sum / used;
+
+  double sqSum = 0;
+  double minVal = samples[first], maxVal = samples[first];
+  for (int i = first; i < count; i++) {
+    const double delta = samples[i] - mean;
+    sqSum += delta * delta;
+    if (samples[i] < minVal) minVal = samples[i];
+    if (samples[i] > maxVal) maxVal = samples[i];
+  }
+
+  *avg = mean;
+  *sd = sqrt(sqSum / used);
+  *lo = minVal;
+  *hi = maxVal;
+  return true;
+}
+
+// atexit() handler registered by cuda_consumer_init(): prints timing
+// statistics for the acquire and release calls recorded while profileAPI was
+// set.
+//
+// Sample 0 of each series is excluded, as in the original averaging formula,
+// so a series needs at least two samples to be reported. Each series is
+// evaluated independently: the squared-deviation sum is no longer carried over
+// from the acquire series into the release series, and the divisor matches the
+// number of samples actually used.
+void acquireApiStat(void) {
+  double avg = 0, sd = 0, lo = 0, hi = 0;
+
+  if (consumerSampleStats(acquire_time, 1, count_acq, &avg, &sd, &lo, &hi)) {
+    printf("acquire Avg: %lf\n", avg);
+    printf("acquire  SD: %lf\n", sd);
+    printf("acquire min: %lf\n", lo);
+    printf("acquire max: %lf\n", hi);
+  }
+
+  if (consumerSampleStats(rel_time, 1, count_rel, &avg, &sd, &lo, &hi)) {
+    printf("release Avg: %lf\n", avg);
+    printf("release  SD: %lf\n", sd);
+    printf("release min: %lf\n", lo);
+    printf("release max: %lf\n", hi);
+  }
+}
+// Waits for a new frame on the consumer's EGL stream, acquires it through
+// CUDA and runs cudaConsumer_filter() on its first plane.
+//
+// cudaConsumer: consumer state; must not be NULL.
+// frameNumber:  added to PROD_DATA / CONS_DATA to form the expected and the
+//               replacement value passed to the filter.
+//
+// Returns CUDA_SUCCESS on success. Returns CUDA_ERROR_INVALID_VALUE for a NULL
+// consumer, CUDA_ERROR_UNKNOWN when the EGL stream query fails, the stream is
+// disconnected or the filter reports an error, and otherwise the status of the
+// failing driver call. The acquired resource is stored in the file-level
+// cudaResource for the matching cudaConsumerReleaseFrame() call.
+CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
+                                  int frameNumber) {
+  CUresult cuStatus = CUDA_SUCCESS;
+  CUeglFrame cudaEgl;
+  struct timespec start, end;
+  EGLint streamState = 0;
+  double curTime;
+
+  if (!cudaConsumer) {
+    printf("%s: Bad parameter\n", __func__);
+    // Previously fell through with CUDA_SUCCESS, hiding the error from callers.
+    cuStatus = CUDA_ERROR_INVALID_VALUE;
+    goto done;
+  }
+
+  // Poll the stream state until a new frame is available; a failed query or a
+  // disconnected stream aborts the wait.
+  while (1) {
+    if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream,
+                           EGL_STREAM_STATE_KHR, &streamState)) {
+      printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+      cuStatus = CUDA_ERROR_UNKNOWN;
+      goto done;
+    }
+    if (streamState == EGL_STREAM_STATE_DISCONNECTED_KHR) {
+      printf("Cuda Consumer: EGL_STREAM_STATE_DISCONNECTED_KHR received\n");
+      cuStatus = CUDA_ERROR_UNKNOWN;
+      goto done;
+    }
+
+    if (streamState == EGL_STREAM_STATE_NEW_FRAME_AVAILABLE_KHR) {
+      break;
+    }
+  }
+  if (cudaConsumer->profileAPI) {
+    getTime(&start);
+  }
+  cuStatus =
+      cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource,
+                                      &cudaConsumer->consCudaStream, 16000);
+  if (cudaConsumer->profileAPI) {
+    getTime(&end);
+    curTime = TIME_DIFF(end, start);
+    // The sample index wraps after 25000 entries; the running total does not.
+    acquire_time[count_acq++] = curTime;
+    if (count_acq == 25000) count_acq = 0;
+    total_time_acq += curTime;
+  }
+  if (cuStatus == CUDA_SUCCESS) {
+    CUdeviceptr pDevPtr = 0;
+    cudaError_t err;
+
+    cuStatus =
+        cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
+    if (cuStatus != CUDA_SUCCESS) {
+      printf("Cuda get resource failed with %d\n", cuStatus);
+      goto done;
+    }
+    pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0];
+
+    err = cudaConsumer_filter(cudaConsumer->consCudaStream, (char *)pDevPtr,
+                              WIDTH * 4, HEIGHT, PROD_DATA + frameNumber,
+                              CONS_DATA + frameNumber, frameNumber);
+    if (err != cudaSuccess) {
+      printf("Cuda Consumer: kernel failed with: %s\n",
+             cudaGetErrorString(err));
+      // Previously returned CUDA_SUCCESS although the filter had failed.
+      cuStatus = CUDA_ERROR_UNKNOWN;
+      goto done;
+    }
+  }
+
+done:
+  return cuStatus;
+}
+
+// Releases the frame held in the file-level cudaResource back to the EGL
+// stream and, when profileAPI is set, records how long the release call took.
+//
+// cudaConsumer: consumer state; must not be NULL.
+// frameNumber:  not used by this function.
+//
+// Returns CUDA_ERROR_INVALID_VALUE for a NULL consumer, otherwise the status
+// of cuEGLStreamConsumerReleaseFrame().
+CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer,
+                                  int frameNumber) {
+  CUresult cuStatus = CUDA_SUCCESS;
+  struct timespec start, end;
+  double curTime;
+
+  if (!cudaConsumer) {
+    printf("%s: Bad parameter\n", __func__);
+    // Previously fell through with CUDA_SUCCESS, hiding the error from callers.
+    cuStatus = CUDA_ERROR_INVALID_VALUE;
+    goto done;
+  }
+  if (cudaConsumer->profileAPI) {
+    getTime(&start);
+  }
+  cuStatus = cuEGLStreamConsumerReleaseFrame(
+      &cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
+  if (cudaConsumer->profileAPI) {
+    getTime(&end);
+    curTime = TIME_DIFF(end, start);
+    // The sample index wraps after 25000 entries; the running total does not.
+    rel_time[count_rel++] = curTime;
+    if (count_rel == 25000) count_rel = 0;
+    total_time_rel += curTime;
+  }
+  if (cuStatus != CUDA_SUCCESS) {
+    printf("cuEGLStreamConsumerReleaseFrame failed, status:%d\n", cuStatus);
+  }
+
+done:
+  return cuStatus;
+}
+
+// Initializes the CUDA driver, creates a context on the device selected by
+// cudaConsumer->cudaDevId, prints the device name and compute capability, and
+// pops the new context off the calling thread again.
+//
+// Returns the failing driver status if cuInit, cuDeviceGet or cuCtxCreate
+// fails, CUDA_SUCCESS otherwise. Terminates the process with exit code 2
+// (EXIT_WAIVED) when the device's compute capability major version is below 6.
+CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
+  CUdevice dev;
+  char gpuName[256];
+  int ccMajor = 0;
+  int ccMinor = 0;
+
+  CUresult rc = cuInit(0);
+  if (rc != CUDA_SUCCESS) {
+    printf("Failed to initialize CUDA\n");
+    return rc;
+  }
+
+  rc = cuDeviceGet(&dev, cudaConsumer->cudaDevId);
+  if (rc != CUDA_SUCCESS) {
+    printf("failed to get CUDA device\n");
+    return rc;
+  }
+
+  rc = cuCtxCreate(&cudaConsumer->context, 0, dev);
+  if (rc != CUDA_SUCCESS) {
+    printf("failed to create CUDA context\n");
+    return rc;
+  }
+
+  // Results of the queries below are not checked; on failure the capability
+  // values keep their zero defaults.
+  cuDeviceGetAttribute(&ccMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                       dev);
+  cuDeviceGetAttribute(&ccMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                       dev);
+  cuDeviceGetName(gpuName, 256, dev);
+  printf(
+      "CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
+      "%d.%d\n\n",
+      dev, gpuName, ccMajor, ccMinor);
+
+  // Detach the freshly created context from this thread before the
+  // architecture check.
+  cuCtxPopCurrent(&cudaConsumer->context);
+
+  if (ccMajor < 6) {
+    printf(
+        "EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU.  "
+        "Exiting...\n");
+    exit(2);  // EXIT_WAIVED
+  }
+
+  return rc;
+}
+
+// Prepares the consumer: copies args->charCnt into the consumer state,
+// allocates a host buffer of that many bytes, creates the consumer's CUDA
+// stream and registers acquireApiStat() to run at process exit.
+//
+// Returns CUDA_SUCCESS on success, CUDA_ERROR_OUT_OF_MEMORY when the host
+// allocation fails, or the status of cuStreamCreate() when that fails. On
+// failure no host buffer is left allocated and the exit handler is not
+// registered.
+CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer,
+                            TestArgs *args) {
+  CUresult status = CUDA_SUCCESS;
+  int bufferSize;
+
+  cudaConsumer->charCnt = args->charCnt;
+  bufferSize = args->charCnt;
+
+  cudaConsumer->pCudaCopyMem = (unsigned char *)malloc(bufferSize);
+  if (cudaConsumer->pCudaCopyMem == NULL) {
+    printf("Cuda Consumer: malloc failed\n");
+    // Previously returned CUDA_SUCCESS although the allocation had failed.
+    status = CUDA_ERROR_OUT_OF_MEMORY;
+    goto done;
+  }
+
+  status = cuStreamCreate(&cudaConsumer->consCudaStream, 0);
+  if (status != CUDA_SUCCESS) {
+    printf("Cuda Consumer: cuStreamCreate failed, status:%d\n", status);
+    // Do not leave the host buffer behind when initialization fails.
+    free(cudaConsumer->pCudaCopyMem);
+    cudaConsumer->pCudaCopyMem = NULL;
+    goto done;
+  }
+
+  atexit(acquireApiStat);
+done:
+  return status;
+}
+
+// Frees the consumer's host buffer and disconnects the consumer from its EGL
+// stream. The buffer pointer is cleared after freeing so that a repeated call
+// cannot free it twice.
+//
+// Returns the status of cuEGLStreamConsumerDisconnect().
+CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer) {
+  if (cudaConsumer->pCudaCopyMem) {
+    free(cudaConsumer->pCudaCopyMem);
+    cudaConsumer->pCudaCopyMem = NULL;
+  }
+  return cuEGLStreamConsumerDisconnect(&cudaConsumer->cudaConn);
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/cuda_consumer.h
+++ b/Samples/EGLStream_CUDA_CrossGPU/cuda_consumer.h
@ -0,0 +1,66 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   CUDA consumer header file
+//
+
+#ifndef _CUDA_CONSUMER_H_
+#define _CUDA_CONSUMER_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cudaEGL.h"
+#include "eglstrm_common.h"
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+// State of the CUDA EGL-stream consumer, passed to every consumer function
+// declared in this header.
+typedef struct _test_cuda_consumer_s {
+  // CUDA context created by cudaDeviceCreateConsumer().
+  CUcontext context;
+  // Stream connection handle used by the acquire, release and disconnect calls.
+  CUeglStreamConnection cudaConn;
+  // Device ordinal passed to cuDeviceGet() when the context is created.
+  int cudaDevId;
+  // EGL display and stream queried for the stream state before a frame is
+  // acquired.
+  EGLDisplay eglDisplay;
+  EGLStreamKHR eglStream;
+  // Copied from TestArgs::charCnt; also the size in bytes of pCudaCopyMem.
+  unsigned int charCnt;
+  // NOTE(review): not referenced in cuda_consumer.cpp - confirm whether it is
+  // still needed.
+  char *cudaBuf;
+  // When true, acquire and release calls are timed and the statistics are
+  // printed at exit.
+  bool profileAPI;
+  // Host buffer allocated in cuda_consumer_init(), freed in
+  // cuda_consumer_Deinit().
+  unsigned char *pCudaCopyMem;
+  // CUDA stream created in cuda_consumer_init() and passed to the acquire,
+  // release and filter calls.
+  CUstream consCudaStream;
+} test_cuda_consumer_s;
+
+// Allocates the host buffer, creates the consumer CUDA stream and registers
+// the profiling report to run at exit.
+CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args);
+// Frees the host buffer and disconnects the consumer from the EGL stream.
+CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer);
+// Waits for a new frame on the stream, acquires it and runs
+// cudaConsumer_filter() on it.
+CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *data, int frameNumber);
+// Releases the most recently acquired frame back to the stream.
+CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *data, int frameNumber);
+// Initializes CUDA and creates the consumer's context on device cudaDevId;
+// exits the process when the device is older than SM 6.0.
+CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer);
+// Defined outside cuda_consumer.cpp. The consumer calls it with the frame's
+// device pointer, WIDTH * 4 as width, HEIGHT as height, the value it expects
+// the producer to have written and the value to write back.
+// NOTE(review): exact check/overwrite semantics are not visible here - confirm
+// against the kernel source.
+cudaError_t cudaConsumer_filter(CUstream cStream, char *pSrc, int width,
+                                int height, char expectedVal, char newVal,
+                                int frameNumber);
+// NOTE(review): defined elsewhere; presumably reports whether a filter found
+// unexpected data - confirm against the kernel source.
+cudaError_t cudaGetValueMismatch(void);
+
+#endif
--- a/Samples/EGLStream_CUDA_CrossGPU/cuda_producer.cpp
+++ b/Samples/EGLStream_CUDA_CrossGPU/cuda_producer.cpp
@ -0,0 +1,288 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple cuda EGL stream producer app
+//
+
+#include "cudaEGL.h"
+#include "cuda_producer.h"
+#include "eglstrm_common.h"
+#include <cuda_runtime.h>
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string.h>
+#include "cuda_runtime.h"
+#include "math.h"
+
+// Value the producer expects to find in a frame before presenting it, and the
+// value it writes into a frame after the frame is returned; starts at
+// INIT_DATA and is incremented on every other return.
+int cudaPresentReturnData = INIT_DATA;
+// NOTE(review): fakePresent and fakeFrame are not referenced in this file -
+// confirm whether they are still needed.
+int fakePresent = 0;
+CUeglFrame fakeFrame;
+// 100-byte device allocation made in cudaProducerInit().
+CUdeviceptr cudaPtrFake;
+// Defined in another translation unit; not referenced in this file.
+extern bool isCrossDevice;
+
+// Fills *cudaEgl so that it describes a single-plane, pitch-linear ARGB frame
+// of WIDTH x HEIGHT pixels with 8-bit unsigned channels, backed by the device
+// memory at cudaPtr.
+//
+// bufferSize is accepted for interface compatibility but is not used.
+void cudaProducerPrepareFrame(CUeglFrame *cudaEgl, CUdeviceptr cudaPtr,
+                              int bufferSize) {
+  CUeglFrame &frame = *cudaEgl;
+
+  // Backing storage: one pitch-linear plane.
+  frame.frameType = CU_EGL_FRAME_TYPE_PITCH;
+  frame.planeCount = 1;
+  frame.frame.pPitch[0] = (void *)cudaPtr;
+
+  // Geometry: four bytes per pixel, no depth.
+  frame.width = WIDTH;
+  frame.height = HEIGHT;
+  frame.depth = 0;
+  frame.pitch = WIDTH * 4;
+
+  // Pixel format.
+  frame.numChannels = 4;
+  frame.eglColorFormat = CU_EGL_COLOR_FORMAT_ARGB;
+  frame.cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;
+}
+
+// Profiling bookkeeping for the present and return calls (written only when
+// profileAPI is set): next write indices, per-call durations and their running
+// sums. Each index wraps back to 0 after 25000 samples while the running sum
+// keeps growing.
+static int count_present = 0, count_return = 0;
+static double present_time[25000] = {0}, total_time_present = 0;
+static double return_time[25000] = {0}, total_time_return = 0;
+
+void presentApiStat(void);
+
+// Computes mean, standard deviation, minimum and maximum over
+// samples[first .. count-1].
+//
+// The statistics are derived from the stored samples rather than from the
+// running totals, because the totals keep accumulating after the sample index
+// wraps around and would no longer match the samples still in the array.
+//
+// Returns false (leaving the outputs untouched) when the range is empty.
+static bool producerSampleStats(const double *samples, int first, int count,
+                                double *avg, double *sd, double *lo,
+                                double *hi) {
+  const int used = count - first;
+  if (used < 1) return false;
+
+  double sum = 0;
+  for (int i = first; i < count; i++) sum += samples[i];
+  const double mean = sum / used;
+
+  double sqSum = 0;
+  double minVal = samples[first], maxVal = samples[first];
+  for (int i = first; i < count; i++) {
+    const double delta = samples[i] - mean;
+    sqSum += delta * delta;
+    if (samples[i] < minVal) minVal = samples[i];
+    if (samples[i] > maxVal) maxVal = samples[i];
+  }
+
+  *avg = mean;
+  *sd = sqrt(sqSum / used);
+  *lo = minVal;
+  *hi = maxVal;
+  return true;
+}
+
+// atexit() handler registered by cudaProducerInit(): prints timing statistics
+// for the present and return calls recorded while profileAPI was set.
+//
+// Present samples are all included: cudaProducerPresentFrame() already skips
+// its first ten calls, and the original mean covered every recorded sample.
+// For return samples, sample 0 is excluded, as in the original averaging
+// formula. Each series is evaluated independently: the squared-deviation sum
+// is no longer carried over from the present series into the return series,
+// the divisor matches the number of samples used, and an empty series is
+// skipped instead of dividing by zero.
+void presentApiStat(void) {
+  double avg = 0, sd = 0, lo = 0, hi = 0;
+
+  if (producerSampleStats(present_time, 0, count_present, &avg, &sd, &lo,
+                          &hi)) {
+    printf("present Avg: %lf\n", avg);
+    printf("present  SD: %lf\n", sd);
+    printf("present min: %lf\n", lo);
+    printf("present max: %lf\n", hi);
+  }
+
+  if (producerSampleStats(return_time, 1, count_return, &avg, &sd, &lo,
+                          &hi)) {
+    printf("return  Avg: %lf\n", avg);
+    printf("return   SD: %lf\n", sd);
+    printf("return  min: %lf\n", lo);
+    printf("return  max: %lf\n", hi);
+  }
+}
+// Runs cudaProducer_filter() on the frame's first plane (expected value
+// cudaPresentReturnData, new value PROD_DATA + t) and then presents the frame
+// to the EGL stream.
+//
+// When profileAPI is set, the duration of the present call is recorded, except
+// for the first ten successful calls. The filter's return value is not
+// checked.
+//
+// Returns the status of cuEGLStreamProducerPresentFrame().
+CUresult cudaProducerPresentFrame(test_cuda_producer_s *cudaProducer,
+                                  CUeglFrame cudaEgl, int t) {
+  static int presentCalls = 0;  // successful presents so far
+  struct timespec tBegin, tEnd;
+  char *frameData = (char *)(CUdeviceptr)cudaEgl.frame.pPitch[0];
+
+  cudaProducer_filter(cudaProducer->prodCudaStream, frameData, WIDTH * 4,
+                      HEIGHT, cudaPresentReturnData, PROD_DATA + t, t);
+
+  if (cudaProducer->profileAPI) {
+    getTime(&tBegin);
+  }
+  CUresult status = cuEGLStreamProducerPresentFrame(
+      &cudaProducer->cudaConn, cudaEgl, &cudaProducer->prodCudaStream);
+  if (status != CUDA_SUCCESS) {
+    printf("Cuda Producer: Present frame failed, status:%d\n", status);
+    return status;
+  }
+
+  presentCalls++;
+  if (cudaProducer->profileAPI && presentCalls > 10) {
+    getTime(&tEnd);
+    const double elapsed = TIME_DIFF(tEnd, tBegin);
+    // The sample index wraps after 25000 entries; the running total does not.
+    present_time[count_present++] = elapsed;
+    if (count_present == 25000) count_present = 0;
+    total_time_present += elapsed;
+  }
+  return status;
+}
+
+// Number of frames returned so far; cudaPresentReturnData is incremented on
+// every call that finds this counter even.
+int flag = 0;
+
+// Takes a frame back from the EGL stream, retrying for as long as the driver
+// reports CUDA_ERROR_LAUNCH_TIMEOUT, and then runs cudaProducer_filter() on
+// the buffer (expected value CONS_DATA + t, new value cudaPresentReturnData).
+//
+// The device pointer is read from cudaEgl before the return call, because that
+// call receives &cudaEgl and may overwrite it. When profileAPI is set, the
+// time spent in the retry loop is recorded. The filter's return value is not
+// checked.
+//
+// Returns CUDA_SUCCESS, or the first status that is neither success nor
+// CUDA_ERROR_LAUNCH_TIMEOUT.
+CUresult cudaProducerReturnFrame(test_cuda_producer_s *cudaProducer,
+                                 CUeglFrame cudaEgl, int t) {
+  struct timespec tBegin, tEnd;
+  CUresult status;
+  char *frameData = (char *)(CUdeviceptr)cudaEgl.frame.pPitch[0];
+
+  if (cudaProducer->profileAPI) {
+    getTime(&tBegin);
+  }
+
+  do {
+    status = cuEGLStreamProducerReturnFrame(&cudaProducer->cudaConn, &cudaEgl,
+                                            &cudaProducer->prodCudaStream);
+  } while (status == CUDA_ERROR_LAUNCH_TIMEOUT);
+
+  if (status != CUDA_SUCCESS) {
+    printf("Cuda Producer: Return frame failed, status:%d\n", status);
+    return status;
+  }
+
+  if (cudaProducer->profileAPI) {
+    getTime(&tEnd);
+    const double elapsed = TIME_DIFF(tEnd, tBegin);
+    // The sample index wraps after 25000 entries; the running total does not.
+    return_time[count_return++] = elapsed;
+    if (count_return == 25000) count_return = 0;
+    total_time_return += elapsed;
+  }
+
+  if (flag % 2 == 0) {
+    cudaPresentReturnData++;
+  }
+  cudaProducer_filter(cudaProducer->prodCudaStream, frameData, WIDTH * 4,
+                      HEIGHT, CONS_DATA + t, cudaPresentReturnData, t);
+  flag++;
+  return status;
+}
+
+// Initializes the CUDA driver, creates a context on the device selected by
+// cudaProducer->cudaDevId, prints the device name and compute capability, and
+// pops the new context off the calling thread again.
+//
+// Returns the failing driver status if cuInit, cuDeviceGet or cuCtxCreate
+// fails, CUDA_SUCCESS otherwise. Terminates the process with exit code 2
+// (EXIT_WAIVED) when the device's compute capability major version is below 6.
+CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer) {
+  CUdevice dev;
+  CUresult rc;
+
+  if ((rc = cuInit(0)) != CUDA_SUCCESS) {
+    printf("Failed to initialize CUDA\n");
+    return rc;
+  }
+  if ((rc = cuDeviceGet(&dev, cudaProducer->cudaDevId)) != CUDA_SUCCESS) {
+    printf("failed to get CUDA device\n");
+    return rc;
+  }
+  if ((rc = cuCtxCreate(&cudaProducer->context, 0, dev)) != CUDA_SUCCESS) {
+    printf("failed to create CUDA context\n");
+    return rc;
+  }
+
+  // Results of the queries below are not checked; on failure the capability
+  // values keep their zero defaults.
+  int smMajor = 0, smMinor = 0;
+  char gpuName[256];
+  cuDeviceGetAttribute(&smMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                       dev);
+  cuDeviceGetAttribute(&smMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                       dev);
+  cuDeviceGetName(gpuName, 256, dev);
+  printf(
+      "CUDA Producer on GPU Device %d: \"%s\" with compute capability "
+      "%d.%d\n\n",
+      dev, gpuName, smMajor, smMinor);
+
+  // Detach the freshly created context from this thread before the
+  // architecture check.
+  cuCtxPopCurrent(&cudaProducer->context);
+
+  if (smMajor < 6) {
+    printf(
+        "EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU.  "
+        "Exiting...\n");
+    exit(2);  // EXIT_WAIVED
+  }
+
+  return rc;
+}
+
+// Allocates nBytes of device memory into *devPtr and copies nBytes from
+// hostSrc into it. Prints a message and returns the failing status if either
+// step fails.
+static CUresult producerAllocAndUpload(CUdeviceptr *devPtr, char *hostSrc,
+                                       int nBytes) {
+  CUresult rc = cuMemAlloc(devPtr, nBytes);
+  if (rc != CUDA_SUCCESS) {
+    printf("Cuda Producer: cuda Malloc failed, status:%d\n", rc);
+    return rc;
+  }
+  rc = cuMemcpyHtoD(*devPtr, (void *)hostSrc, nBytes);
+  if (rc != CUDA_SUCCESS) {
+    printf("Cuda Producer: cuMemCpy failed, status:%d\n", rc);
+  }
+  return rc;
+}
+
+// Prepares the producer: allocates a host staging buffer of args->charCnt
+// bytes filled with INIT_DATA, uploads it into two device buffers (cudaPtr and
+// cudaPtr1), creates the producer's CUDA stream, allocates the 100-byte
+// cudaPtrFake buffer and registers presentApiStat() to run at process exit.
+//
+// Returns CUDA_SUCCESS on success, CUDA_ERROR_UNKNOWN when the host allocation
+// fails, or the status of the first failing driver call. Resources obtained
+// before a failure are not released here, and the exit handler is registered
+// only when every step succeeds.
+CUresult cudaProducerInit(test_cuda_producer_s *cudaProducer, TestArgs *args) {
+  CUresult rc = CUDA_SUCCESS;
+  int nBytes;
+
+  cudaProducer->charCnt = args->charCnt;
+  nBytes = cudaProducer->charCnt;
+
+  // Host staging buffer holding the initial frame contents.
+  cudaProducer->tempBuff = (char *)malloc(nBytes);
+  if (!cudaProducer->tempBuff) {
+    printf("Cuda Producer: Failed to allocate image buffer\n");
+    return CUDA_ERROR_UNKNOWN;
+  }
+  memset((void *)cudaProducer->tempBuff, INIT_DATA, cudaProducer->charCnt);
+
+  // Two device buffers, both seeded from the staging buffer.
+  rc = producerAllocAndUpload(&cudaProducer->cudaPtr, cudaProducer->tempBuff,
+                              nBytes);
+  if (rc != CUDA_SUCCESS) {
+    return rc;
+  }
+  rc = producerAllocAndUpload(&cudaProducer->cudaPtr1, cudaProducer->tempBuff,
+                              nBytes);
+  if (rc != CUDA_SUCCESS) {
+    return rc;
+  }
+
+  rc = cuStreamCreate(&cudaProducer->prodCudaStream, 0);
+  if (rc != CUDA_SUCCESS) {
+    printf("Cuda Producer: cuStreamCreate failed, status:%d\n", rc);
+    return rc;
+  }
+
+  rc = cuMemAlloc(&cudaPtrFake, 100);
+  if (rc != CUDA_SUCCESS) {
+    printf("Cuda Producer: cuda Malloc failed, status:%d\n", rc);
+    return rc;
+  }
+
+  atexit(presentApiStat);
+  return rc;
+}
+
+// Releases everything cudaProducerInit() allocated - the host staging buffer,
+// both device frame buffers and the file-level cudaPtrFake buffer - and then
+// disconnects the producer from its EGL stream.
+//
+// Previously only cudaPtr was freed, leaking cudaPtr1 and cudaPtrFake. Each
+// handle is cleared after it is freed so that a repeated call does not free it
+// twice. Like the original code, this relies on unused handles being zero.
+//
+// Returns the status of cuEGLStreamProducerDisconnect(); results of the
+// individual cuMemFree() calls are not reported.
+CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer) {
+  if (cudaProducer->tempBuff) {
+    free(cudaProducer->tempBuff);
+    cudaProducer->tempBuff = NULL;
+  }
+  if (cudaProducer->cudaPtr) {
+    cuMemFree(cudaProducer->cudaPtr);
+    cudaProducer->cudaPtr = 0;
+  }
+  if (cudaProducer->cudaPtr1) {
+    cuMemFree(cudaProducer->cudaPtr1);
+    cudaProducer->cudaPtr1 = 0;
+  }
+  if (cudaPtrFake) {
+    cuMemFree(cudaPtrFake);
+    cudaPtrFake = 0;
+  }
+  return cuEGLStreamProducerDisconnect(&cudaProducer->cudaConn);
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/cuda_producer.h
+++ b/Samples/EGLStream_CUDA_CrossGPU/cuda_producer.h
@ -0,0 +1,68 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple cuda producer header file
+//
+
+#ifndef _CUDA_PRODUCER_H_
+#define _CUDA_PRODUCER_H_
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include "cudaEGL.h"
+#include "eglstrm_common.h"
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+// State of the CUDA EGL-stream producer, passed to every producer function
+// declared in this header.
+typedef struct _test_cuda_producer_s {
+  //  Stream params
+  // CUDA context created by cudaDeviceCreateProducer().
+  CUcontext context;
+  // Stream connection handle used by the present, return and disconnect calls.
+  CUeglStreamConnection cudaConn;
+  // Device ordinal passed to cuDeviceGet() when the context is created.
+  int cudaDevId;
+  // NOTE(review): eglStream and eglDisplay are not referenced in
+  // cuda_producer.cpp - presumably set up by the EGL stream code; confirm.
+  EGLStreamKHR eglStream;
+  EGLDisplay eglDisplay;
+  // Copied from TestArgs::charCnt; size in bytes of tempBuff, cudaPtr and
+  // cudaPtr1.
+  unsigned int charCnt;
+  // When true, present and return calls are timed and the statistics are
+  // printed at exit.
+  bool profileAPI;
+  // Host staging buffer filled with INIT_DATA in cudaProducerInit().
+  char *tempBuff;
+  // Device buffers allocated in cudaProducerInit() and seeded from tempBuff.
+  CUdeviceptr cudaPtr;
+  CUdeviceptr cudaPtr1;
+  // CUDA stream created in cudaProducerInit() and passed to the present,
+  // return and filter calls.
+  CUstream prodCudaStream;
+} test_cuda_producer_s;
+
+// Allocates and seeds the host and device buffers, creates the producer CUDA
+// stream and registers the profiling report to run at exit.
+CUresult cudaProducerInit(test_cuda_producer_s *cudaProducer, TestArgs *args);
+// Runs cudaProducer_filter() on the frame and presents it to the EGL stream;
+// t is added to PROD_DATA to form the value written into the frame.
+CUresult cudaProducerPresentFrame(test_cuda_producer_s *parserArg,
+                                  CUeglFrame cudaEgl, int t);
+// Takes a frame back from the EGL stream (retrying on launch timeouts) and
+// runs cudaProducer_filter() on it; t is added to CONS_DATA to form the value
+// expected in the frame.
+CUresult cudaProducerReturnFrame(test_cuda_producer_s *parserArg,
+                                 CUeglFrame cudaEgl, int t);
+// Frees the producer's buffers and disconnects it from the EGL stream.
+CUresult cudaProducerDeinit(test_cuda_producer_s *cudaProducer);
+// Initializes CUDA and creates the producer's context on device cudaDevId;
+// exits the process when the device is older than SM 6.0.
+CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer);
+// Defined outside cuda_producer.cpp. The producer calls it with the frame's
+// device pointer, WIDTH * 4 as width, HEIGHT as height, the value it expects
+// to find and the value to write.
+// NOTE(review): exact check/overwrite semantics are not visible here - confirm
+// against the kernel source.
+cudaError_t cudaProducer_filter(CUstream cStream, char *pSrc, int width,
+                                int height, char expectedVal, char newVal,
+                                int frameNumber);
+// Fills *cudaEgl to describe a WIDTH x HEIGHT pitch-linear ARGB frame backed
+// by cudaPtr; bufferSize is currently unused.
+void cudaProducerPrepareFrame(CUeglFrame *cudaEgl, CUdeviceptr cudaPtr,
+                              int bufferSize);
+#endif
--- a/Samples/EGLStream_CUDA_CrossGPU/eglstrm_common.cpp
+++ b/Samples/EGLStream_CUDA_CrossGPU/eglstrm_common.cpp
@ -0,0 +1,423 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Common egl stream functions
+//
+
+#include "eglstrm_common.h"
+
+// Process-wide EGL state, shared with the rest of the sample through the
+// extern declarations in eglstrm_common.h. EGLStreamInit() fills in either the
+// consumer or the producer pair depending on its isConsumer argument, and
+// EGLStreamFini() destroys whichever streams exist.
+EGLStreamKHR g_producerEglStream = EGL_NO_STREAM_KHR;
+EGLStreamKHR g_consumerEglStream = EGL_NO_STREAM_KHR;
+EGLDisplay g_producerEglDisplay = EGL_NO_DISPLAY;
+EGLDisplay g_consumerEglDisplay = EGL_NO_DISPLAY;
+// CUDA device ids reported through EGL_CUDA_DEVICE_NV for the EGL device that
+// EGLStreamInit() selected; they stay at -1 until a device has been chosen.
+int cudaDevIndexProd = -1;
+int cudaDevIndexCons = -1;
+
+#if defined(EXTENSION_LIST)
+// Define one NULL-initialised function pointer per EGL extension entry point
+// named in EXTENSION_LIST (see eglstrm_common.h).
+EXTENSION_LIST(EXTLST_DECL)
+typedef void (*extlst_fnptr_t)(void);
+// Lookup table walked by eglSetupExtensions(): where to store each resolved
+// entry point and the name to resolve it by.
+static struct {
+  extlst_fnptr_t *fnptr;  // address of the pointer defined by EXTLST_DECL
+  char const *name;       // symbol name passed to eglGetProcAddress()
+  // This function is needed only for the dgpu (cross-device) case.
+  // EXTLST_ENTRY supplies only fnptr and name, so this member is
+  // zero-initialised (false) for every entry in the table.
+  bool is_dgpu;
+} extensionList[] = {EXTENSION_LIST(EXTLST_ENTRY)};
+
+// Resolves every EGL extension entry point listed in extensionList through
+// eglGetProcAddress() and stores it in the matching global function pointer.
+// Entries flagged is_dgpu are resolved only when isCrossDevice is true.
+// Returns 1 when all required entry points were found, 0 on the first miss.
+int eglSetupExtensions(bool isCrossDevice) {
+  const size_t entryCount = sizeof(extensionList) / sizeof(extensionList[0]);
+
+  for (size_t idx = 0; idx < entryCount; ++idx) {
+    // dGPU-only entry points are skipped unless this is the cross-device test.
+    if (extensionList[idx].is_dgpu && !isCrossDevice) {
+      continue;
+    }
+
+    *extensionList[idx].fnptr = eglGetProcAddress(extensionList[idx].name);
+    if (*extensionList[idx].fnptr != NULL) {
+      continue;
+    }
+
+    printf("Couldn't get address of %s()\n", extensionList[idx].name);
+    return 0;
+  }
+
+  return 1;
+}
+
+// Creates the EGL display and stream for one side of the producer/consumer
+// pair and records the matching CUDA device id in cudaDevIndexCons/Prod.
+//
+//   isCrossDevice  producer only: select the second EGL device that reports a
+//                  CUDA device instead of the first one.
+//   isConsumer     non-zero: create a new FIFO stream on the first EGL device
+//                  that reports a CUDA device (g_consumerEglDisplay /
+//                  g_consumerEglStream).
+//                  zero: attach to an existing stream through fileDesc
+//                  (g_producerEglDisplay / g_producerEglStream).
+//   fileDesc       producer only: stream file descriptor. It is closed here
+//                  once eglCreateStreamFromFileDescriptorKHR() has been called;
+//                  on earlier failures it is left open for the caller.
+//
+// Returns 1 on success and 0 on any failure, including "no suitable device".
+int EGLStreamInit(bool isCrossDevice, int isConsumer,
+                  EGLNativeFileDescriptorKHR fileDesc) {
+  static const EGLint streamAttrFIFOMode[] = {
+      EGL_STREAM_FIFO_LENGTH_KHR, 5, EGL_SUPPORT_REUSE_NV, EGL_FALSE, EGL_NONE};
+  EGLBoolean eglStatus = EGL_FALSE;
+
+#define MAX_EGL_DEVICES 4
+
+  EGLDeviceEXT devices[MAX_EGL_DEVICES];
+  EGLint numDevices = 0;
+
+  eglStatus = eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices);
+  if (eglStatus != EGL_TRUE) {
+    printf("Error querying EGL devices\n");
+    goto Done;
+  }
+
+  if (numDevices == 0) {
+    printf("No EGL devices found\n");
+    eglStatus = EGL_FALSE;
+    goto Done;
+  }
+
+  // If cross device, create discrete GPU stream first and then create the
+  // integrated GPU stream to connect to it via fd. The other way round fails
+  // in producer connect.
+  //
+  // TODO: Find out if this EGL behavior is by design.
+  if (isConsumer) {
+    int egl_device_id = 0;
+    for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) {
+      EGLAttrib cuda_device;
+      eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id],
+                                          EGL_CUDA_DEVICE_NV, &cuda_device);
+      if (eglStatus == EGL_TRUE) {
+        // We select first EGL-CUDA Capable device for consumer.
+        cudaDevIndexCons = (int)cuda_device;
+        printf(
+            "Found EGL-CUDA Capable device with CUDA Device id = %d out of "
+            "egl_device_id = %d\n",
+            (int)cudaDevIndexCons, egl_device_id);
+        break;
+      }
+    }
+
+    if (egl_device_id >= numDevices) {
+      printf("No CUDA Capable EGL Device found.. Waiving execution\n");
+      // Report failure explicitly instead of relying on the last query result.
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    g_consumerEglDisplay = eglGetPlatformDisplayEXT(
+        EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL);
+    if (g_consumerEglDisplay == EGL_NO_DISPLAY) {
+      printf("Could not get EGL display from device. \n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    eglStatus = eglInitialize(g_consumerEglDisplay, 0, 0);
+    if (!eglStatus) {
+      printf("EGL failed to initialize. \n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    g_consumerEglStream =
+        eglCreateStreamKHR(g_consumerEglDisplay, streamAttrFIFOMode);
+    if (g_consumerEglStream == EGL_NO_STREAM_KHR) {
+      printf("Could not create EGL stream.\n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    eglStatus = eglStreamAttribKHR(g_consumerEglDisplay, g_consumerEglStream,
+                                   EGL_CONSUMER_LATENCY_USEC_KHR, 16000);
+    if (eglStatus != EGL_TRUE) {
+      printf("eglStreamAttribKHR EGL_CONSUMER_LATENCY_USEC_KHR failed\n");
+      goto Done;
+    }
+
+    eglStatus =
+        eglStreamAttribKHR(g_consumerEglDisplay, g_consumerEglStream,
+                           EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR, 16000);
+    if (eglStatus != EGL_TRUE) {
+      printf(
+          "eglStreamAttribKHR EGL_CONSUMER_ACQUIRE_TIMEOUT_USEC_KHR failed\n");
+      goto Done;
+    }
+  }
+
+  if (!isConsumer) {  // Producer
+
+    if (fileDesc == EGL_NO_FILE_DESCRIPTOR_KHR) {
+      printf("Cuda Producer received bad file descriptor\n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    int egl_device_id = 0;
+    int egl_cuda_devices = 0;
+    for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) {
+      EGLAttrib cuda_device = -1;
+      eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id],
+                                          EGL_CUDA_DEVICE_NV, &cuda_device);
+      if (eglStatus == EGL_TRUE) {
+        egl_cuda_devices++;
+        if (isCrossDevice && (egl_cuda_devices > 1)) {
+          // We select second EGL-CUDA Capable device for producer.
+          cudaDevIndexProd = (int)cuda_device;
+          printf(
+              "Found EGL-CUDA Capable device with CUDA Device id = %d "
+              "egl_device_id = %d \n",
+              (int)cudaDevIndexProd, egl_device_id);
+          break;
+        }
+        if (!isCrossDevice) {
+          // We select first EGL-CUDA Capable device for producer same as
+          // consumer.
+          cudaDevIndexProd = (int)cuda_device;
+          printf(
+              "Found EGL-CUDA Capable device with CUDA Device id = %d "
+              "egl_device_id = %d \n",
+              (int)cudaDevIndexProd, egl_device_id);
+          break;
+        }
+      }
+    }
+
+    if (egl_device_id >= numDevices) {
+      printf("No CUDA Capable EGL Device found.. Waiving execution\n");
+      // The loop can end with eglStatus == EGL_TRUE (cross-device run where
+      // the last EGL device is the only CUDA-capable one), which used to be
+      // reported as success. Force the failure result here.
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    g_producerEglDisplay = eglGetPlatformDisplayEXT(
+        EGL_PLATFORM_DEVICE_EXT, (void *)devices[egl_device_id], NULL);
+    if (g_producerEglDisplay == EGL_NO_DISPLAY) {
+      printf("Could not get EGL display from device. \n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    eglStatus = eglInitialize(g_producerEglDisplay, 0, 0);
+    if (!eglStatus) {
+      printf("EGL failed to initialize. \n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    }
+
+    g_producerEglStream =
+        eglCreateStreamFromFileDescriptorKHR(g_producerEglDisplay, fileDesc);
+    close(fileDesc);
+
+    if (g_producerEglStream == EGL_NO_STREAM_KHR) {
+      printf("CUDA Producer Could not create EGL stream.\n");
+      eglStatus = EGL_FALSE;
+      goto Done;
+    } else {
+      printf("Producer created EGLStream for the GPU.\n");
+    }
+  }
+
+Done:
+  return eglStatus == EGL_TRUE ? 1 : 0;
+}
+
+// Destroys the producer stream and the consumer stream, if they exist.
+// When both globals hold the same handle it is destroyed only once, through
+// the producer display.
+void EGLStreamFini(void) {
+  if (g_producerEglStream != EGL_NO_STREAM_KHR) {
+    eglDestroyStreamKHR(g_producerEglDisplay, g_producerEglStream);
+  }
+
+  // Same handle as the producer: already handled above (or nothing to do).
+  if (g_consumerEglStream == g_producerEglStream) {
+    return;
+  }
+  if (g_consumerEglStream == EGL_NO_STREAM_KHR) {
+    return;
+  }
+
+  eglDestroyStreamKHR(g_consumerEglDisplay, g_consumerEglStream);
+}
+
+/* Connect to the unix domain socket named <socket_name>.      */
+/* connect() is retried once per second, up to 60 times, while */
+/* the peer is not yet listening.                               */
+/* Returns the connected socket fd, or -1 on failure/timeout.  */
+int UnixSocketConnect(const char *socket_name) {
+  int sock_fd = -1;
+  struct sockaddr_un sock_addr;
+  int wait_loop = 0;
+
+  sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+  if (sock_fd < 0) {
+    printf("%s: socket create failed.\n", __func__);
+    return -1;
+  }
+
+  if (verbose) printf("%s: send_fd: sock_fd: %d\n", __func__, sock_fd);
+
+  memset(&sock_addr, 0, sizeof(struct sockaddr_un));
+  sock_addr.sun_family = AF_UNIX;
+  // The memset above guarantees NUL termination when the name is truncated.
+  strncpy(sock_addr.sun_path, socket_name, sizeof(sock_addr.sun_path) - 1);
+
+  while (connect(sock_fd, (const struct sockaddr *)&sock_addr,
+                 sizeof(struct sockaddr_un))) {
+    if (wait_loop < 60) {
+      if (!wait_loop)
+        printf("Waiting for EGL stream producer ");
+      else
+        printf(".");
+      fflush(stdout);
+      sleep(1);
+      wait_loop++;
+    } else {
+      printf("\n%s: Waiting timed out\n", __func__);
+      // Do not leak the socket when giving up.
+      close(sock_fd);
+      return -1;
+    }
+  }
+  if (wait_loop) printf("\n");
+
+  if (verbose) printf("%s: Wait is done\n", __func__);
+
+  return sock_fd;
+}
+
+/* Send <fd_to_send> (a file descriptor) to another process     */
+/* over the already-connected unix domain socket <send_fd>, as  */
+/* SCM_RIGHTS ancillary data attached to a one-byte message.    */
+/* Returns 0 on success and -1 when sendmsg() fails.            */
+int EGLStreamSendfd(int send_fd, int fd_to_send) {
+  struct msghdr msg;
+  struct iovec iov[1];
+  char ctrl_buf[CMSG_SPACE(sizeof(int))];
+  struct cmsghdr *cmsg = NULL;
+  int res;
+  memset(&msg, 0, sizeof(msg));
+
+  iov[0].iov_len = 1;             // must send at least 1 byte
+  iov[0].iov_base = (void *)"x";  // any byte value (value ignored)
+  msg.msg_iov = iov;
+  msg.msg_iovlen = 1;
+
+  memset(ctrl_buf, 0, sizeof(ctrl_buf));
+  msg.msg_control = ctrl_buf;
+  msg.msg_controllen = sizeof(ctrl_buf);
+
+  cmsg = CMSG_FIRSTHDR(&msg);
+  cmsg->cmsg_level = SOL_SOCKET;
+  cmsg->cmsg_type = SCM_RIGHTS;
+  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+  // Copy rather than store through a cast pointer: CMSG_DATA() is not
+  // guaranteed to be suitably aligned for an int.
+  memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(fd_to_send));
+
+  // Only the single header actually filled in is handed to the kernel.
+  msg.msg_controllen = cmsg->cmsg_len;
+
+  res = sendmsg(send_fd, &msg, 0);
+  if (res <= 0) {
+    printf("%s: sendmsg failed\n", __func__);
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Listen on a unix domain socket named <socket_name>, accept   */
+/* one connection on it and return the connected fd.            */
+/* The listening socket and the socket file are removed again   */
+/* before returning. Returns -1 on any failure.                 */
+int UnixSocketCreate(const char *socket_name) {
+  int listen_fd;
+  struct sockaddr_un sock_addr;
+  int connect_fd;
+  struct sockaddr_un connect_addr;
+  // accept() reads this as the capacity of connect_addr, so it must not be 0.
+  socklen_t connect_addr_len = sizeof(connect_addr);
+
+  listen_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+  if (listen_fd < 0) {
+    printf("%s: socket create failed\n", __func__);
+    return -1;
+  }
+
+  if (verbose) printf("%s: listen_fd: %d\n", __func__, listen_fd);
+
+  // Remove a stale socket file left behind by an earlier run, if any.
+  unlink(socket_name);
+
+  memset(&sock_addr, 0, sizeof(struct sockaddr_un));
+  sock_addr.sun_family = AF_UNIX;
+  // The memset above guarantees NUL termination when the name is truncated.
+  strncpy(sock_addr.sun_path, socket_name, sizeof(sock_addr.sun_path) - 1);
+
+  if (bind(listen_fd, (const struct sockaddr *)&sock_addr,
+           sizeof(struct sockaddr_un))) {
+    printf("%s: bind error\n", __func__);
+    close(listen_fd);
+    return -1;
+  }
+
+  if (listen(listen_fd, 1)) {
+    printf("%s: listen error\n", __func__);
+    close(listen_fd);
+    unlink(socket_name);
+    return -1;
+  }
+
+  connect_fd =
+      accept(listen_fd, (struct sockaddr *)&connect_addr, &connect_addr_len);
+
+  if (verbose) printf("%s: connect_fd: %d\n", __func__, connect_fd);
+
+  // Exactly one peer is served, so the listener is no longer needed.
+  close(listen_fd);
+  unlink(socket_name);
+  if (connect_fd < 0) {
+    printf("%s: accept failed\n", __func__);
+    return -1;
+  }
+
+  return connect_fd;
+}
+
+/* Receive a file descriptor from another process over the  */
+/* connected unix domain socket <connect_fd>.               */
+/* Returns the file descriptor, or -1 on failure.           */
+/* Note: the integer value of the file descriptor may be    */
+/* different from the integer value in the other process,   */
+/* but the file descriptors in each process will refer to   */
+/* the same file object in the kernel.                      */
+int EGLStreamReceivefd(int connect_fd) {
+  struct msghdr msg;
+  struct iovec iov[1];
+  char msg_buf[1];
+  char ctrl_buf[CMSG_SPACE(sizeof(int))];
+  struct cmsghdr *cmsg;
+  int recvfd = -1;
+
+  memset(&msg, 0, sizeof(msg));
+  memset(ctrl_buf, 0, sizeof(ctrl_buf));
+
+  // One payload byte accompanies the descriptor; its value is ignored.
+  iov[0].iov_base = msg_buf;
+  iov[0].iov_len = sizeof(msg_buf);
+  msg.msg_iov = iov;
+  msg.msg_iovlen = 1;
+
+  msg.msg_control = ctrl_buf;
+  msg.msg_controllen = sizeof(ctrl_buf);
+
+  if (recvmsg(connect_fd, &msg, 0) <= 0) {
+    printf("%s: recvmsg failed\n", __func__);
+    return -1;
+  }
+
+  // Truncated ancillary data cannot contain a complete descriptor.
+  if (msg.msg_flags & MSG_CTRUNC) {
+    printf("%s: Control data truncated\n", __func__);
+    return -1;
+  }
+
+  cmsg = CMSG_FIRSTHDR(&msg);
+  if (!cmsg) {
+    printf("%s: NULL message header\n", __func__);
+    return -1;
+  }
+  if (cmsg->cmsg_level != SOL_SOCKET) {
+    printf("%s: Message level is not SOL_SOCKET\n", __func__);
+    return -1;
+  }
+  if (cmsg->cmsg_type != SCM_RIGHTS) {
+    printf("%s: Message type is not SCM_RIGHTS\n", __func__);
+    return -1;
+  }
+  // Make sure the header really carries a whole int before reading it.
+  if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+    printf("%s: Message is too short to hold a descriptor\n", __func__);
+    return -1;
+  }
+
+  // Copy rather than read through a cast pointer: CMSG_DATA() is not
+  // guaranteed to be suitably aligned for an int.
+  memcpy(&recvfd, CMSG_DATA(cmsg), sizeof(recvfd));
+
+  return recvfd;
+}
+
+#endif
--- a/Samples/EGLStream_CUDA_CrossGPU/eglstrm_common.h
+++ b/Samples/EGLStream_CUDA_CrossGPU/eglstrm_common.h
@ -0,0 +1,113 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Common EGL stream functions header file
+//
+
+#ifndef _EGLSTRM_COMMON_H_
+#define _EGLSTRM_COMMON_H_
+
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "cuda.h"
+#include "cudaEGL.h"
+// Elapsed time between two struct timespec values, in microseconds (double).
+// Expands to calls of getMicrosecond(), which is defined near the end of this
+// header.
+#define TIME_DIFF(end, start) (getMicrosecond(end) - getMicrosecond(start))
+
+// Shared state. The EGL handles and CUDA device indices are defined in
+// eglstrm_common.cpp; `verbose` is defined in helper.h.
+extern EGLStreamKHR g_producerEglStream;
+extern EGLStreamKHR g_consumerEglStream;
+extern EGLDisplay g_producerEglDisplay;
+extern EGLDisplay g_consumerEglDisplay;
+extern int cudaDevIndexCons;
+extern int cudaDevIndexProd;
+extern bool verbose;
+
+// X-macro listing every EGL extension entry point the sample resolves at run
+// time. T is invoked once per entry as T(function-pointer type, function name);
+// pass one of the EXTLST_* macros below to generate code from the list.
+#define EXTENSION_LIST(T)                                                \
+  T(PFNEGLCREATESTREAMKHRPROC, eglCreateStreamKHR)                       \
+  T(PFNEGLDESTROYSTREAMKHRPROC, eglDestroyStreamKHR)                     \
+  T(PFNEGLQUERYSTREAMKHRPROC, eglQueryStreamKHR)                         \
+  T(PFNEGLQUERYSTREAMU64KHRPROC, eglQueryStreamu64KHR)                   \
+  T(PFNEGLQUERYSTREAMTIMEKHRPROC, eglQueryStreamTimeKHR)                 \
+  T(PFNEGLSTREAMATTRIBKHRPROC, eglStreamAttribKHR)                       \
+  T(PFNEGLSTREAMCONSUMERACQUIREKHRPROC, eglStreamConsumerAcquireKHR)     \
+  T(PFNEGLSTREAMCONSUMERRELEASEKHRPROC, eglStreamConsumerReleaseKHR)     \
+  T(PFNEGLSTREAMCONSUMERGLTEXTUREEXTERNALKHRPROC,                        \
+    eglStreamConsumerGLTextureExternalKHR)                               \
+  T(PFNEGLQUERYDEVICESEXTPROC, eglQueryDevicesEXT)                       \
+  T(PFNEGLGETPLATFORMDISPLAYEXTPROC, eglGetPlatformDisplayEXT)           \
+  T(PFNEGLGETSTREAMFILEDESCRIPTORKHRPROC, eglGetStreamFileDescriptorKHR) \
+  T(PFNEGLQUERYDEVICEATTRIBEXTPROC, eglQueryDeviceAttribEXT)             \
+  T(PFNEGLCREATESTREAMFROMFILEDESCRIPTORKHRPROC,                         \
+    eglCreateStreamFromFileDescriptorKHR)
+
+// Generators for EXTENSION_LIST:
+//   EXTLST_DECL    defines the function pointer, initialised to NULL
+//   EXTLST_EXTERN  declares it for other translation units
+//   EXTLST_ENTRY   emits a {pointer address, "name"} table initialiser; it
+//                  relies on the extlst_fnptr_t typedef from the using file
+#define EXTLST_DECL(tx, x) tx x = NULL;
+#define EXTLST_EXTERN(tx, x) extern tx x;
+#define EXTLST_ENTRY(tx, x) {(extlst_fnptr_t *)&x, #x},
+
+// NOTE(review): none of these constants is used in this header or in
+// eglstrm_common.cpp. INIT/PROD/CONS_DATA look like byte patterns written
+// into the frame buffers at each stage - confirm against the
+// producer/consumer sources.
+#define MAX_STRING_SIZE 256
+#define INIT_DATA 0x01
+#define PROD_DATA 0x07
+#define CONS_DATA 0x04
+
+// Path passed as the unix domain socket name.
+// NOTE(review): presumably handed to UnixSocketCreate()/UnixSocketConnect();
+// the call sites are outside this header.
+#define SOCK_PATH "/tmp/tegra_sw_egl_socket"
+
+// Options filled in by parseCmdLine().
+typedef struct _TestArgs {
+  unsigned int charCnt;  // NOTE(review): meaning not visible here
+  bool isProducer;       // set from "-proctype prod|cons"
+} TestArgs;
+
+// Set from the -width/-height options in parseCmdLine(); defined elsewhere.
+extern int WIDTH, HEIGHT;
+
+// Resolves the EGL extension entry points; returns 1 on success, 0 on failure.
+// The definition in eglstrm_common.cpp names this parameter isCrossDevice.
+int eglSetupExtensions(bool is_dgpu);
+// Creates the consumer or producer EGL stream; returns 1 on success, 0 on
+// failure.
+int EGLStreamInit(bool isCrossDevice, int isConsumer,
+                  EGLNativeFileDescriptorKHR fileDesc);
+// Destroys the streams created by EGLStreamInit().
+void EGLStreamFini(void);
+
+// NOTE(review): eglstrm_common.cpp contains no definition of
+// EGLStreamSetAttr() - confirm it is defined elsewhere or remove it.
+int EGLStreamSetAttr(EGLDisplay display, EGLStreamKHR eglStream);
+// Unix domain socket helpers used to pass the stream fd between processes.
+// The socket functions return an fd or -1; EGLStreamSendfd() returns 0 or -1.
+int UnixSocketConnect(const char *socket_name);
+int EGLStreamSendfd(int send_fd, int fd_to_send);
+int UnixSocketCreate(const char *socket_name);
+int EGLStreamReceivefd(int connect_fd);
+
+// Clock sampled by getTime(). Every translation unit that includes this header
+// gets its own copy of the variable.
+static clockid_t clock_id = CLOCK_MONOTONIC;  // CLOCK_PROCESS_CPUTIME_ID;
+
+// Converts a timespec to a microsecond count (seconds and nanoseconds are
+// both scaled to microseconds and added).
+static double getMicrosecond(struct timespec t) {
+  const double wholeSecondsUs = t.tv_sec * 1000000.0;
+  const double fractionUs = t.tv_nsec / 1.0e3;
+  return wholeSecondsUs + fractionUs;
+}
+
+// Stores the current reading of clock_id in *t.
+static inline void getTime(struct timespec *t) {
+  clock_gettime(clock_id, t);
+}
+#endif
--- a/Samples/EGLStream_CUDA_CrossGPU/findegl.mk
+++ b/Samples/EGLStream_CUDA_CrossGPU/findegl.mk
@ -0,0 +1,160 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+#  findegl.mk is used to find the necessary EGL Libraries for specific distributions
+#               this is supported on Linux
+#
+################################################################################
+
+# Determine OS platform and unix distribution.
+# DISTRO ends up holding a lower-case distribution id (e.g. "ubuntu"), or is
+# empty when nothing could be detected. Simple (:=) assignment is used so each
+# probe runs once at parse time instead of on every expansion of $(DISTRO).
+ifeq ("$(TARGET_OS)","linux")
+   # first search lsb_release
+   DISTRO := $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+   ifeq ("$(DISTRO)","")
+     # second search and parse /etc/issue (first word of the first line);
+     # a missing file is silently ignored
+     DISTRO := $(shell cat /etc/issue 2>/dev/null | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" | tr "[:upper:]" "[:lower:]")
+     # ensure data from /etc/issue is valid
+     ifeq (,$(filter $(DISTRO),ubuntu fedora red rhel centos suse))
+       DISTRO :=
+     endif
+     ifeq ("$(DISTRO)","")
+       # third, we can search in /etc/os-release or /etc/{distro}-release
+       DISTRO := $(shell awk '/ID/' /etc/*-release 2>/dev/null | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB")
+     endif
+   endif
+endif
+
+# On Linux: derive candidate library directories (GLPATH / GLLINK / DFLT_PATH)
+# from the detected distribution, then look for libEGL.so and the EGL headers.
+# SAMPLE_ENABLED is forced to 0 when any of them is missing.
+ifeq ("$(TARGET_OS)","linux")
+    # $(info) >> findegl.mk -> LINUX path <<<)
+    # Each set of Linux distros keeps its OpenGL/EGL libraries in different paths.
+    # Each flag holds grep's exit status, so "0" means $(DISTRO) matched.
+    # Simple (:=) assignment: every probe runs exactly once, at parse time.
+    UBUNTU := $(shell echo $(DISTRO) | grep -i ubuntu      >/dev/null 2>&1; echo $$?)
+    FEDORA := $(shell echo $(DISTRO) | grep -i fedora      >/dev/null 2>&1; echo $$?)
+    RHEL   := $(shell echo $(DISTRO) | grep -i 'red\|rhel' >/dev/null 2>&1; echo $$?)
+    CENTOS := $(shell echo $(DISTRO) | grep -i centos      >/dev/null 2>&1; echo $$?)
+    SUSE   := $(shell echo $(DISTRO) | grep -i 'suse\|sles' >/dev/null 2>&1; echo $$?)
+    ifeq ("$(UBUNTU)","0")
+      ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        # cross build for 32-bit ARM: toolchain sysroot, plus TARGET_FS if given
+        GLPATH := /usr/arm-linux-gnueabihf/lib
+        GLLINK := -L/usr/arm-linux-gnueabihf/lib
+        ifneq ($(TARGET_FS),)
+          GLPATH += $(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+          GLLINK += -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+      else ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-aarch64)
+        # cross build for 64-bit ARM: toolchain sysroot, plus TARGET_FS if given
+        GLPATH := /usr/aarch64-linux-gnu/lib
+        GLLINK := -L/usr/aarch64-linux-gnu/lib
+        ifneq ($(TARGET_FS),)
+          GLPATH += $(TARGET_FS)/usr/lib
+          GLPATH += $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+          GLLINK += -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+        endif
+      else
+        # native build: first installed nvidia-* package names the lib directory.
+        # := matters here because GLPATH/GLLINK below are recursive (?=) and
+        # would otherwise re-run dpkg every time they are expanded.
+        UBUNTU_PKG_NAME := $(shell which dpkg >/dev/null 2>&1 && dpkg -l 'nvidia-*' 2>/dev/null | grep '^ii' | awk '{print $$2}' | head -1)
+        ifneq ("$(UBUNTU_PKG_NAME)","")
+          GLPATH    ?= /usr/lib/$(UBUNTU_PKG_NAME)
+          GLLINK    ?= -L/usr/lib/$(UBUNTU_PKG_NAME)
+        endif
+
+        DFLT_PATH ?= /usr/lib
+      endif
+    endif
+    ifeq ("$(SUSE)","0")
+      GLPATH    ?= /usr/X11R6/lib64
+      GLLINK    ?= -L/usr/X11R6/lib64
+      DFLT_PATH ?= /usr/lib64
+    endif
+    ifeq ("$(FEDORA)","0")
+      GLPATH    ?= /usr/lib64/nvidia
+      GLLINK    ?= -L/usr/lib64/nvidia
+      DFLT_PATH ?= /usr/lib64
+    endif
+    ifeq ("$(RHEL)","0")
+      GLPATH    ?= /usr/lib64/nvidia
+      GLLINK    ?= -L/usr/lib64/nvidia
+      DFLT_PATH ?= /usr/lib64
+    endif
+    ifeq ("$(CENTOS)","0")
+      GLPATH    ?= /usr/lib64/nvidia
+      GLLINK    ?= -L/usr/lib64/nvidia
+      DFLT_PATH ?= /usr/lib64
+    endif
+
+  # search the candidate directories (following symlinks) for the EGL library
+  EGLLIB  := $(shell find -L $(GLPATH) $(DFLT_PATH) -name libEGL.so    -print 2>/dev/null)
+
+  ifeq ("$(EGLLIB)","")
+      $(info >>> WARNING - libEGL.so not found, please install libEGL.so <<<)
+      SAMPLE_ENABLED := 0
+  endif
+
+  # headers: target filesystem first, plus the cross toolchain include directory
+  HEADER_SEARCH_PATH ?= $(TARGET_FS)/usr/include
+  ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+      HEADER_SEARCH_PATH += /usr/arm-linux-gnueabihf/include
+  else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-aarch64-linux)
+      HEADER_SEARCH_PATH += /usr/aarch64-linux-gnu/include
+  endif
+
+  EGLHEADER  := $(shell find -L $(HEADER_SEARCH_PATH) -name egl.h -print 2>/dev/null)
+  EGLEXTHEADER  := $(shell find -L $(HEADER_SEARCH_PATH) -name eglext.h -print 2>/dev/null)
+
+  ifeq ("$(EGLHEADER)","")
+      $(info >>> WARNING - egl.h not found, please install egl.h <<<)
+      SAMPLE_ENABLED := 0
+  endif
+  ifeq ("$(EGLEXTHEADER)","")
+      $(info >>> WARNING - eglext.h not found, please install eglext.h <<<)
+      SAMPLE_ENABLED := 0
+  endif
+else
+endif
+
+# QNX builds: these host-compiler flags are also passed to the EGL header
+# probe below.
+# NOTE(review): -V presumably selects the qcc toolchain version/target - confirm.
+ifeq ("$(TARGET_OS)","qnx")
+    HOST_CCFLAGS := -V5.4.0,gcc_ntoaarch64le
+endif
+
+# Attempt to compile (the program is never run) a minimal EGL source file to
+# check whether EGL_SUPPORT_REUSE_NV is defined by the EGL headers available.
+# The probe deliberately hits an #error when the macro IS defined; the number
+# of compiler output lines containing that message is stored in EGL_DEFINES,
+# so 0 means the macro is missing (or the headers could not be compiled).
+# The probe writes test.c (and possibly test.o) in the current directory and
+# removes them afterwards.
+ifneq ($(SAMPLE_ENABLED), 0)
+      $(shell printf "#include <EGL/egl.h>\n#include <EGL/eglext.h>\nint main() {\n#ifdef EGL_SUPPORT_REUSE_NV \n #error \"Compatible EGL header found\" \n  return 0;\n#endif \n return 1;\n}"  > test.c; )
+      EGL_DEFINES := $(shell $(HOST_COMPILER) $(HOST_CCFLAGS) $(CCFLAGS) $(EXTRA_CCFLAGS) -lEGL test.c -c 2>&1 | grep -ic "Compatible EGL header found";)
+      SHOULD_WAIVE := 0
+      ifeq ($(EGL_DEFINES),0)
+        SHOULD_WAIVE := 1
+      endif
+      ifeq ($(SHOULD_WAIVE),1)
+          $(info -----------------------------------------------------------------------------------------------)
+          $(info WARNING - NVIDIA EGL EXTENSIONS are not available in the present EGL headers)
+          $(info -----------------------------------------------------------------------------------------------)
+          $(info   This CUDA Sample cannot be built if the EGL NVIDIA EXTENSIONS like EGL_SUPPORT_REUSE_NV are not supported in EGL headers.)
+          $(info   This will be a dry-run of the Makefile.)
+          $(info   Please install the latest khronos EGL headers and libs to build this sample)
+          $(info -----------------------------------------------------------------------------------------------)
+          SAMPLE_ENABLED := 0
+      endif
+      # clean up the probe files; errors (e.g. no test.o) are ignored
+      $(shell rm test.o test.c 2>/dev/null)
+endif
+
--- a/Samples/EGLStream_CUDA_CrossGPU/helper.h
+++ b/Samples/EGLStream_CUDA_CrossGPU/helper.h
@ -0,0 +1,221 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "eglstrm_common.h"
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+#include <cuda.h>
+
+int parseCmdLine(int argc, char *argv[], TestArgs *args);
+void printUsage(void);
+// Command-line settings, written by parseCmdLine().
+// NOTE(review): these are definitions, not declarations, so this header can be
+// included from only one translation unit without duplicate-symbol errors.
+int NUMTRIALS = 10;   // -n: number of trials
+int profileAPIs = 0;  // -profile
+
+bool verbose = 0;        // -v
+bool isCrossDevice = 0;  // -crossdev
+
+// Advances *index to the value that must follow the option at argv[*index].
+// Returns that value, or NULL (after printing a message) when the option is
+// the last word on the command line.
+static const char *nextOptionValue(int argc, char *argv[], int *index) {
+  const char *option = argv[*index];
+  if (*index + 1 >= argc) {
+    printf("Missing value for option %s\n", option);
+    return NULL;
+  }
+  *index += 1;
+  return argv[*index];
+}
+
+// Parse the command line options, then check that the system can run the
+// requested configuration.
+//
+// Returns 0 on success and -1 on a parse error. "-h" prints the usage text and
+// exits with status 0. After parsing, the function itself terminates the
+// process when the environment is unsuitable: EXIT_FAILURE on CUDA/EGL errors,
+// EXIT_SUCCESS when -crossdev is requested without an integrated GPU, and 2
+// (waived) when no usable EGL device is present.
+int parseCmdLine(int argc, char *argv[], TestArgs *args) {
+  int i;
+
+  for (i = 1; i < argc; i++) {
+    if (strcmp(argv[i], "-h") == 0) {
+      printUsage();
+      exit(0);
+    } else if (strcmp(argv[i], "-n") == 0) {
+      const char *value = nextOptionValue(argc, argv, &i);
+      if (value == NULL) {
+        return -1;
+      }
+      if (sscanf(value, "%d", &NUMTRIALS) != 1 || NUMTRIALS <= 0) {
+        printf("Invalid trial count: %s should be > 0\n", value);
+        return -1;
+      }
+    } else if (strcmp(argv[i], "-profile") == 0) {
+      profileAPIs = 1;
+    } else if (strcmp(argv[i], "-crossdev") == 0 ||
+               strcmp(argv[i], "-dgpu") == 0) {
+      // -dgpu is the deprecated spelling documented by printUsage().
+      isCrossDevice = 1;
+    } else if (strcmp(argv[i], "-width") == 0) {
+      const char *value = nextOptionValue(argc, argv, &i);
+      if (value == NULL) {
+        return -1;
+      }
+      if (sscanf(value, "%d", &WIDTH) != 1 || (WIDTH <= 0)) {
+        printf("Width should be greater than 0\n");
+        return -1;
+      }
+    } else if (strcmp(argv[i], "-height") == 0) {
+      const char *value = nextOptionValue(argc, argv, &i);
+      if (value == NULL) {
+        return -1;
+      }
+      if (sscanf(value, "%d", &HEIGHT) != 1 || (HEIGHT <= 0)) {
+        printf("Height should be greater than 0\n");
+        return -1;
+      }
+    } else if (argv[i][0] != '\0' && 0 == strcmp(&argv[i][1], "proctype")) {
+      // The first character is skipped on purpose (any prefix character is
+      // accepted); the emptiness check keeps &argv[i][1] inside the string.
+      const char *value = nextOptionValue(argc, argv, &i);
+      if (value == NULL) {
+        return -1;
+      }
+      if (!strcasecmp(value, "prod")) {
+        args->isProducer = 1;
+      } else if (!strcasecmp(value, "cons")) {
+        args->isProducer = 0;
+      } else {
+        printf("%s: Bad Process Type: %s\n", __func__, value);
+        return -1;
+      }
+    } else if (strcmp(argv[i], "-v") == 0) {
+      verbose = 1;
+    } else {
+      printf("Unknown option: %s\n", argv[i]);
+      return -1;
+    }
+  }
+
+  // A cross-device run needs at least one integrated GPU among CUDA devices.
+  if (isCrossDevice) {
+    int deviceCount = 0;
+
+    CUresult error_id = cuInit(0);
+    if (error_id != CUDA_SUCCESS) {
+      printf("cuInit(0) returned %d\n", error_id);
+      printf("Result = FAIL\n");
+      exit(EXIT_FAILURE);
+    }
+
+    error_id = cuDeviceGetCount(&deviceCount);
+    if (error_id != CUDA_SUCCESS) {
+      printf("cuDeviceGetCount returned %d\n", (int)error_id);
+      printf("Result = FAIL\n");
+      exit(EXIT_FAILURE);
+    }
+
+    int iGPUexists = 0;
+    CUdevice dev;
+    for (dev = 0; dev < deviceCount; ++dev) {
+      int integrated = 0;
+      CUresult error_result = cuDeviceGetAttribute(
+          &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
+
+      if (error_result != CUDA_SUCCESS) {
+        printf("cuDeviceGetAttribute returned error : %d\n", (int)error_result);
+        exit(EXIT_FAILURE);
+      }
+
+      if (integrated) {
+        iGPUexists = 1;
+      }
+    }
+
+    if (!iGPUexists) {
+      printf("No Integrated GPU found in the system.\n");
+      printf(
+          "-crossdev option is only supported on systems with an Integrated "
+          "GPU and a Discrete GPU\n");
+      printf("Waiving the execution\n");
+      exit(EXIT_SUCCESS);
+    }
+  }
+
+  // The EGL entry points must be resolved before the device queries below.
+  if (!eglSetupExtensions(isCrossDevice)) {
+    printf("SetupExtentions failed \n");
+    exit(EXIT_FAILURE);
+  }
+#define MAX_EGL_DEVICES 4
+  EGLDeviceEXT devices[MAX_EGL_DEVICES];
+  EGLint numDevices = 0;
+  EGLBoolean eglStatus =
+      eglQueryDevicesEXT(MAX_EGL_DEVICES, devices, &numDevices);
+  if (eglStatus != EGL_TRUE) {
+    printf("Error querying EGL devices\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (numDevices == 0) {
+    printf("No EGL devices found\n");
+    exit(2);  // EXIT_WAIVED
+  }
+
+  // Look for at least one EGL device that maps to a CUDA device.
+  int egl_device_id = 0;
+  for (egl_device_id = 0; egl_device_id < numDevices; egl_device_id++) {
+    EGLAttrib cuda_device;
+    eglStatus = eglQueryDeviceAttribEXT(devices[egl_device_id],
+                                        EGL_CUDA_DEVICE_NV, &cuda_device);
+    if (eglStatus == EGL_TRUE) {
+      break;
+    }
+  }
+
+  if (egl_device_id >= numDevices) {
+    printf("No CUDA Capable EGL Device found.. Waiving execution\n");
+    exit(2);  // EXIT_WAIVED
+  }
+
+  if (isCrossDevice) {
+    if (numDevices == 1) {
+      printf(
+          "Found only one EGL device, cannot setup cross GPU streams. "
+          "Waiving\n");
+      exit(2);  // EXIT_WAIVED
+    }
+  }
+
+  return 0;
+}
+
+// Starts the producer as a separate background process by running this
+// sample's binary with "-proctype prod" through system(), forwarding the
+// -crossdev and -v options when they are set. The calling process then
+// continues as the consumer (args->isProducer is cleared).
+void launchProducer(TestArgs *args) {
+  /* Cross-process creation of producer */
+  char producerCmd[1024];
+
+  /* The trailing "& " makes the shell run the process in bg */
+  snprintf(producerCmd, sizeof(producerCmd), "%s%s%s%s",
+           "./EGLStream_CUDA_CrossGPU -proctype prod ",
+           isCrossDevice ? "-crossdev " : "", verbose ? "-v " : "", "& ");
+
+  printf("\n%s: Crossproc Producer command: %s \n", __func__, producerCmd);
+
+  /*Create crossproc Producer*/
+  system(producerCmd);
+
+  /*Enable crossproc Consumer in the same process */
+  args->isProducer = 0;
+}
+
+// Prints the command-line help, covering the options accepted by
+// parseCmdLine().
+void printUsage(void) {
+  printf("Usage:\n");
+  printf("  -h           Print this help message\n");
+  printf("  -n n         Exit after running n trials. Set to 10 by default\n");
+  printf(
+      "  -profile     Profile time taken by ReleaseAPI. Not set by default\n");
+  printf("  -crossdev    Run with producer on idgpu and consumer on dgpu\n");
+  printf("  -dgpu        (same as -crossdev, deprecated)\n");
+  printf("  -width w     Set the width, must be greater than 0\n");
+  printf("  -height h    Set the height, must be greater than 0\n");
+  printf("  -proctype t  Process type: prod (producer) or cons (consumer)\n");
+  printf("  -v           verbose output\n");
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/kernel.cu
+++ b/Samples/EGLStream_CUDA_CrossGPU/kernel.cu
@ -0,0 +1,140 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// DESCRIPTION:   Simple CUDA consumer rendering sample app
+//
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "eglstrm_common.h"
+
+extern bool isCrossDevice;
+
+// Device-side mismatch bookkeeping shared by the producer and consumer
+// checks: numErrors counts reported mismatches (read back by getNumErrors),
+// errorFound latches to 1 once a mismatch has been reported and is never
+// reset in this file.
+__device__ static unsigned int numErrors = 0, errorFound = 0;
+
+// Producer-side check executed by every thread of testKernelProducer: compares
+// the byte at this thread's global 1-D index against expectedVal and reports
+// the first mismatch.
+//   data        - buffer under test, indexed by blockDim.x*blockIdx.x+threadIdx.x
+//   size        - unused here; no bounds check is performed on the index
+//   expectedVal - value every byte is expected to hold
+//   frameNumber - trial index, used only in the diagnostic message
+__device__ void checkProducerDataGPU(char *data, int size, char expectedVal,
+                                     int frameNumber) {
+  const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (data[idx] == expectedVal) {
+    return;
+  }
+  // Many threads can mismatch simultaneously. The plain read/write of
+  // errorFound and the plain numErrors++ raced between them; atomicCAS lets
+  // exactly one thread claim the report and atomicAdd makes the count exact.
+  if (atomicCAS(&errorFound, 0u, 1u) != 0u) {
+    return;
+  }
+  // atomicAdd returns the pre-increment value, which is what was printed.
+  const unsigned int priorErrors = atomicAdd(&numErrors, 1u);
+  printf("Producer FOUND:%d expected: %d at %d for trial %d %d\n", data[idx],
+         expectedVal, idx, frameNumber, priorErrors);
+}
+
+// Consumer-side check executed by every thread of testKernelConsumer: compares
+// the byte at this thread's global 1-D index against expectedVal and reports
+// the first mismatch through the file-level numErrors / errorFound counters.
+//   data        - buffer under test, indexed by blockDim.x*blockIdx.x+threadIdx.x
+//   size        - unused here; no bounds check is performed on the index
+//   expectedVal - value every byte is expected to hold
+//   frameNumber - trial index, used only in the diagnostic message
+__device__ void checkConsumerDataGPU(char *data, int size, char expectedVal,
+                                     int frameNumber) {
+  const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (data[idx] == expectedVal) {
+    return;
+  }
+  // Many threads can mismatch simultaneously. The plain read/write of
+  // errorFound and the plain numErrors++ raced between them; atomicCAS lets
+  // exactly one thread claim the report and atomicAdd makes the count exact.
+  if (atomicCAS(&errorFound, 0u, 1u) != 0u) {
+    return;
+  }
+  // atomicAdd returns the pre-increment value, which is what was printed.
+  const unsigned int priorErrors = atomicAdd(&numErrors, 1u);
+  printf("Consumer FOUND:%d expected: %d at %d for trial %d %d\n", data[idx],
+         expectedVal, idx, frameNumber, priorErrors);
+}
+
+// Kernel: each thread stores newVal into the byte at its global 1-D index.
+// There is no bounds check, so the launch must not cover more threads than
+// pSrc has bytes.
+__global__ void writeDataToBuffer(char *pSrc, char newVal) {
+  const unsigned int offset = blockDim.x * blockIdx.x + threadIdx.x;
+  pSrc[offset] = newVal;
+}
+
+// Kernel wrapper around checkConsumerDataGPU; newVal is accepted but unused.
+// NOTE(review): size is declared char, so a byte count passed by the caller
+// is narrowed on the way in; the value is only forwarded and never read.
+__global__ void testKernelConsumer(char *pSrc, char size, char expectedVal,
+                                   char newVal, int frameNumber) {
+  const int forwardedSize = size;
+  checkConsumerDataGPU(pSrc, forwardedSize, expectedVal, frameNumber);
+}
+
+// Kernel wrapper around checkProducerDataGPU; newVal is accepted but unused.
+// NOTE(review): size is declared char, so a byte count passed by the caller
+// is narrowed on the way in; the value is only forwarded and never read.
+__global__ void testKernelProducer(char *pSrc, char size, char expectedVal,
+                                   char newVal, int frameNumber) {
+  const int forwardedSize = size;
+  checkProducerDataGPU(pSrc, forwardedSize, expectedVal, frameNumber);
+}
+// Kernel: copies the device-side mismatch counter into *numErr (device memory).
+__global__ void getNumErrors(int *numErr) {
+  numErr[0] = numErrors;
+}
+
+// Producer-side processing of one frame buffer on stream pStream: optionally
+// verifies that every byte equals expectedVal, then overwrites the buffer
+// with newVal.
+//   pSrc          - device buffer holding the frame
+//   width, height - the kernels are launched with (width*height)/1024 blocks
+//                   of 1024 threads, one thread per byte; any remainder of
+//                   width*height modulo 1024 is not covered
+//   frameNumber   - trial index, forwarded for diagnostics
+// Returns the launch status of the kernels. Previously launch errors were
+// ignored and cudaSuccess was returned unconditionally.
+cudaError_t cudaProducer_filter(cudaStream_t pStream, char *pSrc, int width,
+                                int height, char expectedVal, char newVal,
+                                int frameNumber) {
+  const int numBytes = width * height;
+  const int numBlocks = numBytes / 1024;
+
+  // When the consumer is on the dGPU and the producer on the iGPU, the frame
+  // is not copied back to the iGPU when it is returned, so the consumer's
+  // changes are not visible to the producer. The check is therefore only run
+  // in the single-device case.
+  if (isCrossDevice == 0) {
+    testKernelProducer<<<numBlocks, 1024, 1, pStream>>>(
+        pSrc, numBytes, expectedVal, newVal, frameNumber);
+    cudaError_t checkStatus = cudaGetLastError();
+    if (checkStatus != cudaSuccess) {
+      return checkStatus;
+    }
+  }
+
+  writeDataToBuffer<<<numBlocks, 1024, 1, pStream>>>(pSrc, newVal);
+  return cudaGetLastError();
+}
+
+// Consumer-side processing of one frame buffer on stream cStream: verifies
+// that every byte equals expectedVal, then overwrites the buffer with newVal.
+//   pSrc          - device buffer holding the frame
+//   width, height - the kernels are launched with (width*height)/1024 blocks
+//                   of 1024 threads, one thread per byte; any remainder of
+//                   width*height modulo 1024 is not covered
+//   frameNumber   - trial index, forwarded for diagnostics
+// Returns the launch status of the kernels. Previously launch errors were
+// ignored and cudaSuccess was returned unconditionally.
+cudaError_t cudaConsumer_filter(cudaStream_t cStream, char *pSrc, int width,
+                                int height, char expectedVal, char newVal,
+                                int frameNumber) {
+  const int numBytes = width * height;
+  const int numBlocks = numBytes / 1024;
+
+  testKernelConsumer<<<numBlocks, 1024, 1, cStream>>>(
+      pSrc, numBytes, expectedVal, newVal, frameNumber);
+  cudaError_t checkStatus = cudaGetLastError();
+  if (checkStatus != cudaSuccess) {
+    return checkStatus;
+  }
+
+  writeDataToBuffer<<<numBlocks, 1024, 1, cStream>>>(pSrc, newVal);
+  return cudaGetLastError();
+}
+
+// Reads the device-side mismatch counter back to the host.
+// Returns cudaSuccess when no mismatch was recorded, cudaErrorUnknown when at
+// least one was, or the failing CUDA call's error code when the read-back
+// itself fails.
+cudaError_t cudaGetValueMismatch() {
+  int numErr_h = 0;
+  int *numErr_d = NULL;
+
+  cudaError_t err = cudaMalloc(&numErr_d, sizeof(int));
+  if (err != cudaSuccess) {
+    printf("Cuda Main: cudaMalloc failed with %s\n", cudaGetErrorString(err));
+    return err;
+  }
+
+  getNumErrors<<<1, 1>>>(numErr_d);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Cuda Main: getNumErrors launch failed with %s\n",
+           cudaGetErrorString(err));
+    cudaFree(numErr_d);
+    return err;
+  }
+
+  err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    printf("Cuda Main: cudaDeviceSynchronize failed with %s\n",
+           cudaGetErrorString(err));
+    // Stop here: the counter may never have been written, so copying it back
+    // and judging the run by it (as the code used to do) is meaningless.
+    cudaFree(numErr_d);
+    return err;
+  }
+
+  err = cudaMemcpy(&numErr_h, numErr_d, sizeof(int), cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    printf("Cuda Main: cudaMemcpy failed with %s\n", cudaGetErrorString(err));
+    cudaFree(numErr_d);
+    return err;
+  }
+
+  err = cudaFree(numErr_d);
+  if (err != cudaSuccess) {
+    printf("Cuda Main: cudaFree failed with %s\n", cudaGetErrorString(err));
+    return err;
+  }
+
+  return (numErr_h > 0) ? cudaErrorUnknown : cudaSuccess;
+}
--- a/Samples/EGLStream_CUDA_CrossGPU/main.cpp
+++ b/Samples/EGLStream_CUDA_CrossGPU/main.cpp
@ -0,0 +1,392 @@
+/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cudaEGL.h"
+#include "cuda_consumer.h"
+#include "cuda_producer.h"
+#include "eglstrm_common.h"
+#include "helper.h"
+#if defined(EXTENSION_LIST)
+EXTENSION_LIST(EXTLST_EXTERN)
+#endif
+
+bool signal_stop = 0;
+extern bool verbose;
+
+// Signal handler installed for SIGINT by main(): raises the global
+// signal_stop flag and logs the signal number.
+// NOTE(review): printf is not on POSIX's list of async-signal-safe functions.
+static void sig_handler(int sig) {
+  signal_stop = true;
+  printf("Signal: %d\n", sig);
+}
+
+// Final step of the consumer process; never returns.
+// Tears down EGL state via EGLStreamFini(), reads the producer's int status
+// from send_fd, prints the combined PASSED/FAILED line and exits.
+//   consumerStatus - 0 when the consumer side succeeded
+//   send_fd        - socket connected to the producer, or -1 when the
+//                    connection was never established
+void DoneCons(int consumerStatus, int send_fd) {
+  EGLStreamFini();
+
+  // get the final status from producer, combine and print
+  int producerStatus = -1;
+  bool received = false;
+  if (send_fd >= 0) {
+    // Only a complete int counts: a short read or EOF (producer exited
+    // without reporting) must not be mistaken for a status.
+    received = (recv(send_fd, (void *)&producerStatus, sizeof(int), 0) ==
+                (ssize_t)sizeof(int));
+    close(send_fd);
+  }
+  if (!received) {
+    printf("%s: Cuda Consumer could not receive status from producer.\n",
+           __func__);
+    producerStatus = -1;
+  }
+
+  if (producerStatus == 0 && consumerStatus == 0) {
+    printf("&&&& EGLStream_CUDA_CrossGPU PASSED\n");
+    exit(EXIT_SUCCESS);
+  } else {
+    printf("&&&& EGLStream_CUDA_CrossGPU FAILED\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Final step of the producer process; never returns.
+// Tears down EGL state via EGLStreamFini(), sends producerStatus to the
+// consumer over connect_fd and exits with a matching exit code.
+//   producerStatus - 0 when the producer side succeeded
+//   connect_fd     - socket connected to the consumer, or -1 when the
+//                    connection was never established
+void DoneProd(int producerStatus, int connect_fd) {
+  EGLStreamFini();
+
+  bool sent = false;
+  if (connect_fd >= 0) {
+    // Skip send()/close() entirely when there is no valid descriptor.
+    sent = (send(connect_fd, (void *)&producerStatus, sizeof(int), 0) ==
+            (ssize_t)sizeof(int));
+    close(connect_fd);
+  }
+  if (!sent) {
+    printf("%s: Cuda Producer could not send status to consumer.\n", __func__);
+  }
+
+  exit(producerStatus == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+int WIDTH = 8192, HEIGHT = 8192;
+// Entry point for both halves of the cross-process EGLStream test.
+//
+// Consumer role (args.isProducer == 0): initialises EGL and CUDA, spawns the
+// producer via launchProducer(), connects to it over the SOCK_PATH socket,
+// hands it the EGL stream file descriptor, then acquires and releases two
+// frames per trial for NUMTRIALS trials.
+//
+// Producer role: creates the SOCK_PATH socket, receives the stream file
+// descriptor, re-creates the stream, waits for the consumer to connect and
+// presents/returns two frames per trial.
+//
+// Every failure leaves through DoneCons()/DoneProd(), which exit the process.
+// The success paths now do the same, so the producer actually reports its
+// status and the consumer prints the combined PASSED/FAILED line; before,
+// both roles fell through to "return 0" and that exchange never happened.
+int main(int argc, char **argv) {
+  TestArgs args = {0, false};
+  CUresult curesult = CUDA_SUCCESS;
+  unsigned int j = 0;
+  cudaError_t err = cudaSuccess;
+  EGLNativeFileDescriptorKHR fileDescriptor = EGL_NO_FILE_DESCRIPTOR_KHR;
+  CUeglFrame cudaEgl1, cudaEgl2;
+  int consumerStatus = 0;
+  int send_fd = -1;
+
+  if (parseCmdLine(argc, argv, &args) < 0) {
+    printUsage();
+    curesult = CUDA_ERROR_UNKNOWN;
+    DoneCons(consumerStatus, send_fd);
+  }
+
+  printf("Width : %u, height: %u and iterations: %u\n", WIDTH, HEIGHT,
+         NUMTRIALS);
+
+  if (!args.isProducer)  // Consumer code
+  {
+    test_cuda_consumer_s cudaConsumer;
+    memset(&cudaConsumer, 0, sizeof(test_cuda_consumer_s));
+    cudaConsumer.profileAPI = profileAPIs;
+
+    // Hook up Ctrl-C handler
+    signal(SIGINT, sig_handler);
+
+    if (!EGLStreamInit(isCrossDevice, !args.isProducer,
+                       EGL_NO_FILE_DESCRIPTOR_KHR)) {
+      printf("EGLStream Init failed.\n");
+      curesult = CUDA_ERROR_UNKNOWN;
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    cudaConsumer.cudaDevId = cudaDevIndexCons;
+    curesult = cudaDeviceCreateConsumer(&cudaConsumer);
+    if (curesult != CUDA_SUCCESS) {
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    cuCtxPushCurrent(cudaConsumer.context);
+
+    // Spawn the producer process; this process carries on as the consumer.
+    launchProducer(&args);
+
+    args.charCnt = WIDTH * HEIGHT * 4;
+
+    curesult = cuda_consumer_init(&cudaConsumer, &args);
+    if (curesult != CUDA_SUCCESS) {
+      printf("Cuda Consumer: Init failed, status: %d\n", curesult);
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    cuCtxPopCurrent(&cudaConsumer.context);
+
+    send_fd = UnixSocketConnect(SOCK_PATH);
+    if (-1 == send_fd) {
+      printf("%s: Cuda Consumer cannot create socket %s\n", __func__,
+             SOCK_PATH);
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    cuCtxPushCurrent(cudaConsumer.context);
+    cudaConsumer.eglStream = g_consumerEglStream;
+    cudaConsumer.eglDisplay = g_consumerEglDisplay;
+
+    // Send the EGL stream FD to producer
+    fileDescriptor = eglGetStreamFileDescriptorKHR(cudaConsumer.eglDisplay,
+                                                   cudaConsumer.eglStream);
+    if (EGL_NO_FILE_DESCRIPTOR_KHR == fileDescriptor) {
+      printf("%s: Cuda Consumer could not get EGL file descriptor.\n",
+             __func__);
+      eglDestroyStreamKHR(cudaConsumer.eglDisplay, cudaConsumer.eglStream);
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    if (verbose)
+      printf("%s: Cuda Consumer EGL stream FD obtained : %d.\n", __func__,
+             fileDescriptor);
+
+    int res = EGLStreamSendfd(send_fd, fileDescriptor);
+    if (-1 == res) {
+      printf("%s: Cuda Consumer could not send EGL file descriptor.\n",
+             __func__);
+      consumerStatus = -1;
+      close(fileDescriptor);
+      // The producer re-creates the stream from this FD, so there is nothing
+      // to run against once the hand-off fails. Exiting here also avoids the
+      // second close(fileDescriptor) the old fall-through reached further
+      // down.
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    if (CUDA_SUCCESS !=
+        (curesult = cuEGLStreamConsumerConnect(&(cudaConsumer.cudaConn),
+                                               cudaConsumer.eglStream))) {
+      printf("FAILED Connect CUDA consumer with error %d\n", curesult);
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    // Two acquire/release rounds per trial.
+    for (j = 0; j < NUMTRIALS; j++) {
+      curesult = cudaConsumerAcquireFrame(&cudaConsumer, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
+        consumerStatus = -1;
+        DoneCons(consumerStatus, send_fd);
+      }
+      curesult = cudaConsumerReleaseFrame(&cudaConsumer, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
+        consumerStatus = -1;
+        DoneCons(consumerStatus, send_fd);
+      }
+
+      curesult = cudaConsumerAcquireFrame(&cudaConsumer, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
+        consumerStatus = -1;
+        DoneCons(consumerStatus, send_fd);
+      }
+      curesult = cudaConsumerReleaseFrame(&cudaConsumer, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Consumer Test failed for frame = %d\n", j + 1);
+        consumerStatus = -1;
+        DoneCons(consumerStatus, send_fd);
+      }
+    }
+    cuCtxSynchronize();
+    close(fileDescriptor);
+    err = cudaGetValueMismatch();
+    if (err != cudaSuccess) {
+      printf("Consumer: App failed with value mismatch\n");
+      curesult = CUDA_ERROR_UNKNOWN;
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    EGLint streamState = 0;
+    if (!eglQueryStreamKHR(cudaConsumer.eglDisplay, cudaConsumer.eglStream,
+                           EGL_STREAM_STATE_KHR, &streamState)) {
+      printf("Main, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+      curesult = CUDA_ERROR_UNKNOWN;
+      consumerStatus = -1;
+      DoneCons(consumerStatus, send_fd);
+    }
+
+    if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) {
+      if (CUDA_SUCCESS != (curesult = cuda_consumer_Deinit(&cudaConsumer))) {
+        printf("Consumer Disconnect FAILED.\n");
+        consumerStatus = -1;
+        DoneCons(consumerStatus, send_fd);
+      }
+    }
+
+    // Success path: collect the producer's status and print the verdict.
+    DoneCons(consumerStatus, send_fd);
+  } else  // Producer
+  {
+    test_cuda_producer_s cudaProducer;
+    memset(&cudaProducer, 0, sizeof(test_cuda_producer_s));
+    cudaProducer.profileAPI = profileAPIs;
+    int producerStatus = 0;
+
+    // Final argument 0: an existing value in the environment is kept.
+    setenv("CUDA_EGL_PRODUCER_RETURN_WAIT_TIMEOUT", "1600", 0);
+
+    int connect_fd = -1;
+    // Hook up Ctrl-C handler
+    signal(SIGINT, sig_handler);
+
+    // Create connection to Consumer
+    connect_fd = UnixSocketCreate(SOCK_PATH);
+    if (-1 == connect_fd) {
+      printf("%s: Cuda Producer could not create socket: %s.\n", __func__,
+             SOCK_PATH);
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    // Get the file descriptor of the stream from the consumer process
+    // and re-create the EGL stream from it
+    fileDescriptor = EGLStreamReceivefd(connect_fd);
+    if (-1 == fileDescriptor) {
+      printf("%s: Cuda Producer could not receive EGL file descriptor \n",
+             __func__);
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    if (!EGLStreamInit(isCrossDevice, 0, fileDescriptor)) {
+      printf("EGLStream Init failed.\n");
+      producerStatus = -1;
+      curesult = CUDA_ERROR_UNKNOWN;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    cudaProducer.eglDisplay = g_producerEglDisplay;
+    cudaProducer.eglStream = g_producerEglStream;
+    cudaProducer.cudaDevId = cudaDevIndexProd;
+
+    curesult = cudaDeviceCreateProducer(&cudaProducer);
+    if (curesult != CUDA_SUCCESS) {
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    args.charCnt = WIDTH * HEIGHT * 4;
+    cuCtxPushCurrent(cudaProducer.context);
+    curesult = cudaProducerInit(&cudaProducer, &args);
+    if (curesult != CUDA_SUCCESS) {
+      printf("Cuda Producer: Init failed, status: %d\n", curesult);
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    // wait for consumer to connect first: poll the stream state, sleeping one
+    // second between attempts, until it reads CONNECTING, the query fails, or
+    // ten attempts have been used.
+    // (Renamed from "err": it shadowed the function-level cudaError_t err.)
+    int queryOk = 0;
+    int wait_loop = 0;
+    EGLint streamState = 0;
+    do {
+      queryOk =
+          eglQueryStreamKHR(cudaProducer.eglDisplay, cudaProducer.eglStream,
+                            EGL_STREAM_STATE_KHR, &streamState);
+      if ((0 != queryOk) && (EGL_STREAM_STATE_CONNECTING_KHR != streamState)) {
+        sleep(1);
+        wait_loop++;
+      }
+    } while ((wait_loop < 10) && (0 != queryOk) &&
+             (streamState != EGL_STREAM_STATE_CONNECTING_KHR));
+
+    if ((0 == queryOk) || (wait_loop >= 10)) {
+      printf(
+          "%s: Cuda Producer eglQueryStreamKHR EGL_STREAM_STATE_KHR failed.\n",
+          __func__);
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    if (CUDA_SUCCESS != (curesult = cuEGLStreamProducerConnect(
+                             &(cudaProducer.cudaConn), cudaProducer.eglStream,
+                             WIDTH, HEIGHT))) {
+      printf("Connect CUDA producer FAILED with error %d\n", curesult);
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    printf("main - Cuda Producer and Consumer Initialized.\n");
+
+    cudaProducerPrepareFrame(&cudaEgl1, cudaProducer.cudaPtr, args.charCnt);
+    cudaProducerPrepareFrame(&cudaEgl2, cudaProducer.cudaPtr1, args.charCnt);
+
+    // Per trial: present both frames, then take both back.
+    for (j = 0; j < NUMTRIALS; j++) {
+      curesult = cudaProducerPresentFrame(&cudaProducer, cudaEgl1, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n",
+               j + 1, curesult);
+        producerStatus = -1;
+        DoneProd(producerStatus, connect_fd);
+      }
+
+      curesult = cudaProducerPresentFrame(&cudaProducer, cudaEgl2, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n",
+               j + 1, curesult);
+        producerStatus = -1;
+        DoneProd(producerStatus, connect_fd);
+      }
+
+      curesult = cudaProducerReturnFrame(&cudaProducer, cudaEgl1, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n",
+               j + 1, curesult);
+        producerStatus = -1;
+        DoneProd(producerStatus, connect_fd);
+      }
+
+      curesult = cudaProducerReturnFrame(&cudaProducer, cudaEgl2, j);
+      if (curesult != CUDA_SUCCESS) {
+        printf("Cuda Producer Test failed for frame = %d with cuda error:%d\n",
+               j + 1, curesult);
+        producerStatus = -1;
+        DoneProd(producerStatus, connect_fd);
+      }
+    }
+
+    cuCtxSynchronize();
+    err = cudaGetValueMismatch();
+    if (err != cudaSuccess) {
+      printf("Prod: App failed with value mismatch\n");
+      curesult = CUDA_ERROR_UNKNOWN;
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    printf("Tear Down Start.....\n");
+    if (!eglQueryStreamKHR(cudaProducer.eglDisplay, cudaProducer.eglStream,
+                           EGL_STREAM_STATE_KHR, &streamState)) {
+      printf("Main, eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
+      curesult = CUDA_ERROR_UNKNOWN;
+      producerStatus = -1;
+      DoneProd(producerStatus, connect_fd);
+    }
+
+    if (streamState != EGL_STREAM_STATE_DISCONNECTED_KHR) {
+      if (CUDA_SUCCESS != (curesult = cudaProducerDeinit(&cudaProducer))) {
+        printf("Producer Disconnect FAILED with %d\n", curesult);
+        producerStatus = -1;
+        DoneProd(producerStatus, connect_fd);
+      }
+    }
+    unsetenv("CUDA_EGL_PRODUCER_RETURN_WAIT_TIMEOUT");
+
+    // Success path: report the final status to the waiting consumer.
+    DoneProd(producerStatus, connect_fd);
+  }
+
+  // Not reached: both branches exit through DoneCons()/DoneProd().
+  return 0;
+}
--- a/Samples/EGLStreams_CUDA_Interop/.vscode/c_cpp_properties.json
+++ b/Samples/EGLStreams_CUDA_Interop/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/../../Common"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++14",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
--- a/Samples/EGLStreams_CUDA_Interop/.vscode/extensions.json
+++ b/Samples/EGLStreams_CUDA_Interop/.vscode/extensions.json
@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
--- a/Samples/EGLStreams_CUDA_Interop/.vscode/launch.json
+++ b/Samples/EGLStreams_CUDA_Interop/.vscode/launch.json
@ -0,0 +1,10 @@
+{
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/EGLStream_CUDA_Interop"
+        }
+    ]
+}
--- a/Samples/EGLStreams_CUDA_Interop/.vscode/tasks.json
+++ b/Samples/EGLStreams_CUDA_Interop/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/EGLStreams_CUDA_Interop/Makefile
+++ b/Samples/EGLStreams_CUDA_Interop/Makefile
@ -0,0 +1,433 @@
+################################################################################
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += --qpp-config 5.4.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Per-target subdirectory of the CUDA installation. It stays empty for any
+# architecture/OS pair that is not listed below. The conditions are mutually
+# exclusive, so at most one assignment takes effect.
+CUDA_INSTALL_TARGET_DIR :=
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+  CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+endif
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+  CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+endif
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+  CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+endif
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+  CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+endif
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+  CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+endif
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+  CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+endif
+
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+  CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+endif
+
+# ppc64le is matched on the architecture alone, whatever TARGET_OS is.
+ifeq ($(TARGET_ARCH),ppc64le)
+  CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Build type: release by default. Passing dbg=1 switches to a debug build and
+# adds -g -G to the nvcc flags.
+BUILD_TYPE := release
+ifeq ($(dbg),1)
+  NVCCFLAGS += -g -G
+  BUILD_TYPE := debug
+endif
+
+# Complete nvcc compile flags: nvcc's own options first, then every host-compiler
+# flag, each one forwarded behind its own -Xcompiler.
+ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) \
+               $(addprefix -Xcompiler ,$(CCFLAGS)) \
+               $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+# Non-empty only when lsb_release identifies the distribution as Ubuntu; errors
+# from a missing lsb_release are discarded. Assigned with := so the command
+# pipeline runs once at parse time instead of on every reference to UBUNTU.
+UBUNTU := $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
+
+# The sample starts out enabled; each unsupported platform check below prints a
+# notice and waives it by setting SAMPLE_ENABLED to 0.
+SAMPLE_ENABLED := 1
+
+# Mac OSX is not supported.
+ifeq ($(TARGET_OS),darwin)
+    $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on Mac OSX - waiving sample <<<)
+    SAMPLE_ENABLED := 0
+endif
+
+# ARMv7 is not supported.
+ifeq ($(TARGET_ARCH),armv7l)
+    $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on ARMv7 - waiving sample <<<)
+    SAMPLE_ENABLED := 0
+endif
+
+# Android is not supported.
+ifeq ($(TARGET_OS),android)
+    $(info >>> WARNING - EGLStream_CUDA_Interop is not supported on android - waiving sample <<<)
+    SAMPLE_ENABLED := 0
+endif
+
+# Complete nvcc link flags: the compile flags as they stand at this point,
+# followed by every linker flag, each one forwarded behind its own -Xlinker.
+# The variable is simply expanded, so anything appended to ALL_CCFLAGS further
+# down is not picked up here.
+ALL_LDFLAGS := $(ALL_CCFLAGS) \
+               $(addprefix -Xlinker ,$(LDFLAGS)) \
+               $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Include path shared by the samples, and the (initially empty) library list.
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Pull in the helper makefile that looks for the EGL libraries.
+include ./findegl.mk
+
+# Link against EGL on every target OS except darwin.
+ifneq ($(TARGET_OS),darwin)
+  LIBRARIES += -lEGL
+endif
+
+# Detect whether the host compiler is recent enough (GCC >= 5.1.0) to provide the
+# C++11 support this sample needs; if it is not, the sample is waived.
+ifeq ($(TARGET_OS),linux)
+# NOTE(review): empty/space are no longer needed by this check; they are kept in
+# case another part of the build refers to them - confirm before removing.
+    empty :=
+    space := $(empty) $(empty)
+    GCCVERSIONSTRING := $(shell $(HOST_COMPILER) -dumpversion)
+# Split the dotted version into words. -dumpversion may print fewer than three
+# components (for example only the major number, or major.minor), so a missing
+# component defaults to 0 instead of being dropped or duplicated.
+    gcc_version_words := $(subst ., ,$(GCCVERSIONSTRING))
+    gcc_major := $(or $(word 1,$(gcc_version_words)),0)
+    gcc_minor := $(or $(word 2,$(gcc_version_words)),0)
+# Comparable number: major * 100 + minor. The patch level cannot change the
+# outcome of a ">= 5.1.0" test, so it is left out.
+    GCCVERSION := $(shell expr $(gcc_major) \* 100 + $(gcc_minor))
+#$(warning $(GCCVERSION))
+
+    IS_MIN_VERSION := $(shell expr $(GCCVERSION) \>= 501)
+
+    ifeq ($(IS_MIN_VERSION), 1)
+        $(info >>> GCC Version is greater or equal to 5.1.0 <<<)
+    else
+        $(info >>> Waiving build. Minimum GCC version required is 5.1.0<<<)
+        SAMPLE_ENABLED := 0
+    endif
+endif
+
+# Choose how the CUDA driver library is linked. On darwin the CUDA framework is
+# added to the link flags; on every other OS libcuda.so is looked up in a list of
+# stub directories and, when found, its directory is added to LIBRARIES.
+ifeq ($(TARGET_OS),darwin)
+  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
+else
+  # CUDA_SEARCH_PATH is set with ?= throughout, so a value that is already
+  # defined is kept. On x86_64 a second stub directory is always appended.
+  ifeq ($(TARGET_ARCH),x86_64)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
+  endif
+
+  # On aarch64 QNX a TARGET_OVERRIDE replaces the search path outright (:=),
+  # even when CUDA_SEARCH_PATH had already been set.
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
+    ifdef TARGET_OVERRIDE
+        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
+    endif
+  endif
+
+  ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
+  endif
+
+  # A ppc64le host additionally searches the lib64 stub directory.
+  ifeq ($(HOST_ARCH),ppc64le)
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
+  endif
+
+  # Look for libcuda.so directly inside the search directories (-maxdepth 1),
+  # following symbolic links (-L); find's error output is discarded. A CUDALIB
+  # that is already defined is kept because of ?=.
+  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+  ifeq ("$(CUDALIB)","")
+    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
+    SAMPLE_ENABLED := 0
+  else
+    # Keep only the first match (drop everything from the first space on), then
+    # strip the /libcuda.so file name so that CUDALIB holds the directory.
+    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
+    LIBRARIES += -L$(CUDALIB) -lcuda
+  endif
+endif
+
+# Extra nvcc flags for the compile steps, including C++11 mode. They are appended
+# after ALL_LDFLAGS took its copy of ALL_CCFLAGS, so the link step does not get them.
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+# When the sample has been waived, EXEC defaults to an echo prefix, so every
+# recipe line that starts with $(EXEC) only prints its command instead of
+# running it. An EXEC that is already defined is kept because of ?=.
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+
+# None of these targets names a file produced by this Makefile. Declaring them
+# phony keeps a stray file called e.g. "clean" or "build" from masking the rule.
+.PHONY: all build check.deps run clean clobber
+
+# "all" builds the sample through the "build" alias.
+all: build
+
+build: EGLStream_CUDA_Interop
+
+# Report whether the checks made while parsing this Makefile left the sample
+# enabled or waived.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Compile each C++ source of the sample into the object file of the same name
+# with nvcc. This is a static pattern rule, so it applies only to the objects listed.
+cuda_consumer.o cuda_producer.o eglstrm_common.o main.o: %.o: %.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Link the four object files into the sample executable, then copy it into
+# ../../bin/<TARGET_ARCH>/<TARGET_OS>/<BUILD_TYPE>. Every command keeps its own
+# $(EXEC) prefix so that, when EXEC is an echo, all three lines are only printed.
+EGLStream_CUDA_Interop: cuda_consumer.o cuda_producer.o eglstrm_common.o main.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+# Build the sample if necessary, then launch it from the current directory.
+run: build
+	$(EXEC) ./EGLStream_CUDA_Interop
+
+# Remove the executable and object files built here, plus the copy of the
+# executable placed in the bin tree.
+clean:
+	rm -f EGLStream_CUDA_Interop cuda_consumer.o cuda_producer.o eglstrm_common.o main.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/EGLStream_CUDA_Interop
+
+# clobber has no recipe of its own; it only triggers clean.
+clobber: clean
--- a/Show More
+++ b/Show More