Compare commits

...

324 Commits

Author SHA1 Message Date
Rob Armstrong
8a9e2c830c
Update 1_Utilities/README.md to redirect bandwidthTest to NVBandwidth (#371) 2025-05-22 11:43:14 -07:00
Rob Armstrong
adacf1cffd
Merge pull request #368 from XSShawnZeng/master
Update the vulkan headers include sequence and the transpose code format check
2025-05-21 09:27:13 -07:00
shawnz
da3b7a2b3c Update the vulkanImageCUDA/vulkanImageCUDA.cu for Windows headers 2025-05-19 17:43:08 +08:00
shawnz
5987a9e9fa Update transpose for code format check 2025-05-19 17:38:42 +08:00
shawnz
107f3f537f Update the include files sequence for vulkan samples on Windows 2025-05-19 17:38:22 +08:00
Francesco Rizzi
b530f1cf42
Fix bug in 6_Performance/transpose: copy sharedmem kernel (#363)
Update kernel loop bounds handling, main loop data copy to avoid incorrect reuse of output results.

---------

Authored-by: Francesco Rizzi <francesco.rizzi@ng-analytics.com>
2025-05-05 08:43:23 -07:00
Rob Armstrong
cab7c66b4f Update pre-commit config to include Python and JSON for EOL, whitespace checks 2025-05-01 10:17:42 -07:00
Rob Armstrong
8d400cfb7f Additional minor changes to run_tests.py output formatting 2025-05-01 10:14:09 -07:00
Rob Armstrong
6d6d964f97 Minor changes to run_tests.py output formatting 2025-05-01 09:54:25 -07:00
Rob Armstrong
ab68d58d59 Remove unused bin/x86_64 directory hierarchy 2025-05-01 09:53:54 -07:00
Rob Armstrong
c70d79cf3b Final 12.9 README updates 2025-05-01 09:39:06 -07:00
Rob Armstrong
14b1bfdcc4 Replace README references to "CUDA Toolkit 12.5" with general "CUDA Toolkit" 2025-04-30 09:46:45 -07:00
Rob Armstrong
c14a0114d6 Some samples require multiple GPUs. Update 'run_tests.py' to skip them on single- or no-GPU systems. 2025-04-30 09:45:20 -07:00
Rob Armstrong
ee15cc0fe2 Merge branch 'shawnz_bugs_fix' into 'master'
Bug fix for 5241914, 5164417 and 5097376

See merge request cuda-samples/cuda-samples!107
2025-04-28 08:53:11 -07:00
shawnz
3438fd4875 Update README for OpenMP 2025-04-28 23:44:45 +08:00
shawnz
b27b55ec70 Bug 5241914: Fix the error message for cuSolverDn_LinearSolver 2025-04-27 16:57:02 +08:00
shawnz
49159f3739 Bug 5164417 and 5097376: Fix the OpenMP detection issue for MSVC and Clang 2025-04-27 16:50:12 +08:00
Rob Armstrong
1680a1dc7f Update Windows FreeImage configuration instructions in README.md 2025-04-21 09:20:22 -07:00
Rob Armstrong
49daf0e4e0 Merge Bug 5199167: Fix the includes issue for 5_Domain_Specific\simpleD3D12
See merge request cuda-samples/cuda-samples!106
2025-04-21 08:11:52 -07:00
shawnz
a45fd3bd7c Bug 5199167: Fix the includes issue for 5_Domain_Specific\simpleD3D12 2025-04-21 11:52:33 +08:00
Rob Armstrong
0345908807 Update run_tests.py to enable multithreading 2025-04-07 08:48:44 -07:00
Rob Armstrong
3b9c8ce2e9 Merge branch 'shawnz_bugs_fix' into 'master'
Bug 5207005: Append pid in shmName for Linux only as this is for MIG scenario

See merge request cuda-samples/cuda-samples!100
2025-04-07 08:21:40 -07:00
shawnz
e77d6eb5ab Bug 5207005: Append pid in shmName for Linux only as this is for MIG scenario 2025-04-07 17:17:17 +08:00
Rob Armstrong
ac700327a2 Add folders to CMakeLists.txt for supporting generators and IDEs 2025-04-05 09:54:24 -07:00
Rob Armstrong
17703dd426 Merge branch 'shawnz_bugs_fix' into 'master'
Bug 5196977: Update includes for nbody

See merge request cuda-samples/cuda-samples!98
2025-04-03 01:16:20 -07:00
shawnz
a32d5badf7 Bug 5196977: Update includes for nbody 2025-04-03 15:30:05 +08:00
Rob Armstrong
1fd22429c3 Merge branch 'shawnz_bugs_fix' into 'master'
Changes fixing bugs 5196977, 4914019, 4191696, and 5199167.

See merge request cuda-samples/cuda-samples!97
2025-04-02 22:28:17 -07:00
Rob Armstrong
00ac0a1673 Remove bandwidthTest subdirectory from CMakeLists.txt 2025-04-02 22:27:30 -07:00
shawnz
b013387a39 Update code format 2025-04-03 11:23:26 +08:00
Rob Armstrong
9d921e0fe7 Add CONTRIBUTING.md 2025-04-02 11:29:16 -07:00
Rob Armstrong
7d1730f348 Remove outdated bandwidthTest sample 2025-04-02 11:19:48 -07:00
shawnz
718fe6486d Bug 5199167: Adjust the include header files sequence for simpleD3D11/simpleD3D11Texture 2025-04-02 15:10:29 +08:00
shawnz
ad9908e32b Bug 4914019 & 4191696: Append pid in shmName for the MIG multi-threaded scenario 2025-04-02 11:20:09 +08:00
shawnz
952d6edf92 Bug 5196977: Include helper_gl.h before cuda_gl_interop.h 2025-04-01 16:07:32 +08:00
Rob Armstrong
685709bfc7 Merge branch 'shawnz_bugs_fix' into 'master'
Bug fix for bug 5194249, 5188945 and 5164374

See merge request cuda-samples/cuda-samples!95
2025-03-31 08:00:50 -07:00
shawnz
0c92c34ca9 Bug 5164374: Remove the register keyword, which was deprecated and removed in the C++17 standard 2025-03-31 15:13:56 +08:00
shawnz
0d82634f70 5188945: Add freeglut and glew64 .dll files for minsizeRel/RelWithDebInfo build 2025-03-31 15:07:29 +08:00
shawnz
4abbdf4e80 Bug 5194249: Need to include cuda_runtime.h for cudaNvSci after the clang format change 2025-03-31 14:57:31 +08:00
Rob Armstrong
914ca00f89 Small update to README.md to clarify test script usage. 2025-03-28 15:16:10 -07:00
Rob Armstrong
c8034f368a Add helper utility to test-run all built samples (see README.md for usage details) 2025-03-28 15:07:07 -07:00
Rob Armstrong
ceab6e8bcc Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks. 2025-03-27 10:30:07 -07:00
Rob Armstrong
2cd58fbc9a Update README version for 12.9 2025-03-26 10:24:22 -07:00
Rob Armstrong
c0ab53f986 Update all sample CMakeLists.txt to include ENABLE_CUDA_DEBUG flag to enable cuda-gdb 2025-03-26 10:08:59 -07:00
Rob Armstrong
b87c243bbb Add -lineinfo flag to all targets to include line information for developer tools 2025-03-26 09:44:20 -07:00
Rob Armstrong
e214cd29aa Update gencode arguments for separate kernel fatbin builds 2025-03-26 09:28:37 -07:00
Rob Armstrong
06d72496c2 Merge branch 'shawnz_tegra_crossbuild_toolchain' into 'master'
Bug 5133197: Add cmake toolchain and update the CMakeList of some sample...

See merge request cuda-samples/cuda-samples!94
2025-03-25 14:52:02 -07:00
shawnz
2848d3bd21 Bug 5176886: Enable nvJPEG samples for aarch64 2025-03-21 13:02:14 +08:00
shawnz
bd0f630bf4 Bug 5133197: Add cmake toolchain and update the CMakeLists of some samples for Tegra Linux cross build 2025-03-20 12:43:44 +08:00
shawnz
ab9166a6b2 Bug 5139353 and 5139213: Enhancement for streamOrderedAllocationIPC 2025-03-12 15:28:54 +08:00
Rob Armstrong
c90a1c6981 Merge public repo changes 2025-03-08 08:30:35 -08:00
Rob Armstrong
9370f11e69 graphConditionalNodes: Additional tweaks to launch dimension initialization (#348) 2025-03-05 18:18:37 -08:00
Rob Armstrong
291435e0b4
graphConditionalNodes: Additional tweaks to launch dimension initialization (#348) 2025-03-05 18:17:27 -08:00
Rob Armstrong
8d901e745d graphConditionalNodes: Change launch dimension initialization for better cross-platform compatibility (#346) 2025-03-05 08:33:35 -08:00
Rob Armstrong
990ebc01c2
graphConditionalNodes: Change launch dimension initialization for better cross-platform compatibility (#346) 2025-03-05 08:32:58 -08:00
Shawn Zeng
9adce9d9f2 Update file CMakeLists.txt 2025-03-03 19:19:50 -08:00
Rob Armstrong
bcad2c9e61 graphConditionalNodes: Add switch, while, if/else conditional examples and minor cleanup (#344) 2025-03-03 17:50:22 -08:00
Rob Armstrong
e7b23470d5
graphConditionalNodes: Add switch, while, if/else conditional examples and minor cleanup (#344) 2025-03-03 17:49:17 -08:00
Shawn Zeng
310e7f2a11 Bug 5143332: Remove the redundant content in 0_Introduction/CMakeLists.txt 2025-03-03 17:37:48 -08:00
Shawn Zeng
7f0f63f311 Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP 2025-02-27 03:01:47 -08:00
Shawn Zeng
acd3a015c8 Revert "Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP"
This reverts commit a9869fd6eaeecc748fc5f10f4b331fa41efbdaca
2025-02-27 02:48:03 -08:00
shawnz
a9869fd6ea Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP 2025-02-27 18:43:53 +08:00
XSShawnZeng
3e8f91d1a1
Several small bug fixes for Windows platforms
* Enhancement for GLFW include and lib search

* Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp

* Update CMakeLists.txt for the sample 0_Introduction/template

* Copy .dll to correct dir for 5_Domain_Specific/Mandelbrot

* Fix typo

* Update changelog for cudaNvSciBufMultiplanar
2025-02-26 08:23:39 -08:00
Jonathan Bentz
f3b7c41ad6
cudaNvSci: Update README.md fixing typo (#337)
Fixes #193
2025-02-21 09:21:43 -08:00
Jonathan Bentz
29fb758e62
conjugateGradient: Ensure allocated memory is freed (#336)
Fixes #202
2025-02-21 09:20:53 -08:00
Jonathan Bentz
3bc08136ff
Update README.md link for sortingNetworks (#335)
Fixes #302
2025-02-21 09:19:21 -08:00
Jonathan Bentz
85eefa06c4
boxFilter: Remove unused parameter (#338)
Fixes: #122
2025-02-21 09:17:45 -08:00
XSShawnZeng
c357dd1e6b
Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp (#334) 2025-02-21 09:14:25 -08:00
Jonathan Bentz
efb46383e0
Transpose: Change TILE_DIM to 32 to fix bank conflicts
Fixes #175
2025-02-20 15:46:44 -08:00
XSShawnZeng
8d564d5e3a
Enhancement for GLFW include and lib search (#331)
Fixes NVIDIA bug 5115098
2025-02-20 08:06:40 -08:00
Jake Hemstad
37c5bcbef4 Update kernels.cuh 2025-02-19 17:33:10 -08:00
Rob Armstrong
940a4c7a91
memMapIpc: Resolve build-time warnings and minor potential issues (#329)
* Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109

* 3_CUDA_Features/memMapIPCDrv: Increase procIdx buffer size to prevent potential buffer overflow

* memMapIPCDrv: Fix memory leaks and improve header inclusion

- Remove redundant string.h header
- Add memory cleanup for dynamically allocated JIT options and log buffer
- Fix printf format specifier for unsigned long long
2025-02-19 15:52:20 -08:00
ohmaya
61bd39800d
simplePrintf.cu: "Compute capability" text (#299)
Compute %d.%d capability => Compute capability %d.%d
2025-02-19 15:22:34 -08:00
Rob Armstrong
8a96d2eee7
Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109 2025-02-19 10:43:18 -08:00
Rob Armstrong
e762d58260
Merge pull request #247 from sangeetsatheesh/master
Fix typo from Open issue #161
2025-02-18 17:22:48 -08:00
Rob Armstrong
8fd1701744
Merge branch 'master' into master 2025-02-18 17:22:04 -08:00
Rob Armstrong
94765c1597
Fix minor typo in README.md (#326) 2025-02-18 17:14:14 -08:00
Rob Armstrong
c87881f02c
Update matrix multiplication sample README references (#325)
- Clarify reference to Shared Memory section in CUDA programming guide
- Update cuBLAS interface version description
- Add hyperlink to Shared Memory documentation
2025-02-18 14:02:59 -08:00
Rob Armstrong
25400b6b3c
Merge pull request #287 from steffen-v/patch-1
fix "gridy" comandline argument for initMC
2025-02-18 13:30:27 -08:00
Rob Armstrong
e24f62e28c
Fix README.md version number typo
Fix inadvertent reference to prior release in README.md
2025-02-15 13:37:51 -08:00
Rob Armstrong
db3eea2394
Update CUDA Samples for CTK 12.8 release and migrate build system to CMake
Update CUDA Samples for CTK 12.8 release and migrate build system to CMake
2025-02-15 13:23:26 -08:00
Rob Armstrong
04f3686bbe
Merge pull request #24 from XSShawnZeng/master
Enhancement for finding GLFW on WIN and copy .dll files to executable…
2025-02-14 15:03:34 -08:00
shawnz
0e87b76137 Update README 2025-02-14 22:46:04 +08:00
shawnz
fb6fcb0110 Enhancement for finding GLFW on WIN and copy .dll files to executable dir for some samples 2025-02-14 22:37:51 +08:00
Rob Armstrong
14b8ceb56f
Merge pull request #23 from XSShawnZeng/master
Add SM support for simpleAtomicIntrinsics
2025-02-12 22:40:36 -08:00
shawnz
a6737fd72b Add SM support for simpleAtomicIntrinsics 2025-02-13 11:53:55 +08:00
Rob Armstrong
96901090bc nvJPEG: Modify write_images function to return void instead of int
- Changed return type from int to void
- Removed EXIT_FAILURE return in error case (unchecked)
- Removed "control reaches end of non-void function" warning
2025-02-12 11:49:35 -08:00
Rob Armstrong
8b2b51e2a5 NV12toBGRandResize: Fix potential buffer overflow in file output functions
- Increased filename buffer sizes from 120 to 256 characters
- Replaced sprintf() with snprintf() to prevent potential buffer overflows
2025-02-12 11:41:53 -08:00
Rob Armstrong
dcce6e1f14
Merge pull request #22 from XSShawnZeng/master
Update source code for cudaGraphPerfScaling and remove dupe target in cuda-c-linking CMakeLists.txt
2025-02-12 08:14:25 -08:00
shawnz
cc3d94f81c Update .dll copy for 7_libNVVM 2025-02-12 18:12:26 +08:00
shawnz
7ee6db679e Remove dupe target in cuda-c-linking CMakeLists.txt 2025-02-12 15:49:29 +08:00
shawnz
24a617c043 Update source code for cudaGraphPerfScaling 2025-02-12 12:22:55 +08:00
Rob Armstrong
93f1c78c5b 2_Concepts_and_Techniques/EGLStream_CUDA_Interop: Update types for CUDA consumer to use const char* to resolve build warnings 2025-02-11 17:41:59 -08:00
Rob Armstrong
5932d18738 Fix warning about potential string overflow in 0_Introduction/simpleIPC 2025-02-11 17:31:36 -08:00
Rob Armstrong
5206607816
Merge pull request #21 from XSShawnZeng/master
Copy all the needed .ll and .dll for 7_libNVVM
2025-02-11 08:14:17 -08:00
shawnz
f8f3e1b347 Add new line at end of files 2025-02-11 17:05:29 +08:00
shawnz
4fcfa82d7d Copy all the needed .ll and .dll for 7_libNVVM 2025-02-11 17:01:43 +08:00
Rob Armstrong
7a9bd38ecc Update OpenGL sample build settings on Windows 2025-02-10 23:12:38 -08:00
Rob Armstrong
9582bb03a9
Merge pull request #20 from XSShawnZeng/master
Turn on assert log for SimpleAssert and copy files to folder of executable for watershedSegmentationNPP
2025-02-10 08:06:33 -08:00
shawnz
11bc856cad Turn on assert log for SimpleAssert and copy files to folder of executable for watershedSegmentationNPP 2025-02-10 17:02:44 +08:00
Rob Armstrong
95308ffc23 Add missing build targets to general samples 2025-02-08 13:04:26 -05:00
Rob Armstrong
56852fbb50 Add explicit system libraries needed on some Linux distributions 2025-02-07 15:43:17 -05:00
Rob Armstrong
152ba4b941
Merge pull request #19 from XSShawnZeng/master
Bug 5097243: Add nvJitLink lib for Windows
2025-02-07 09:42:33 -08:00
shawnz
a8138b60fe Bug 5097243: Add nvJitLink lib for Windows 2025-02-07 17:28:10 +08:00
Rob Armstrong
c7bfd4418b
Merge pull request #18 from jnbntz/nvvm_updates
changing to CMAKE_CURRENT_BINARY_DIR for the copy of input files
2025-02-06 08:31:23 -08:00
Rob Armstrong
eb378b2fce
Merge branch 'master' into nvvm_updates 2025-02-06 08:31:03 -08:00
Rob Armstrong
fd5674f9e5
Merge pull request #17 from XSShawnZeng/master
Fix the nvsci lib finding issue for auto-linux and update sm list for Tegra samples
2025-02-06 08:28:21 -08:00
Rob Armstrong
bb2aebaad8 Remove unused duplicate targets in libNVVM samples 2025-02-06 08:25:35 -08:00
shawnz
2539826a99 Fix the nvsci lib finding issue for auto-linux and update sm list for Tegra samples 2025-02-06 17:05:48 +08:00
Jonathan Bentz
1eb3c947c3 changing to CMAKE_CURRENT_BINARY_DIR for the copy of input files 2025-02-05 19:57:07 -06:00
Rob Armstrong
9d03b030a6
Merge pull request #16 from XSShawnZeng/master
Adding support for Blackwell
2025-02-04 21:46:10 -08:00
shawnz
78f83ca02d Adding support for Blackwell 2025-02-05 11:17:06 +08:00
Rob Armstrong
e8a041d783 Ensure -Wno-deprecated-gpu-targets is passed to PTX and fatbin compilation steps 2025-02-04 12:07:51 -08:00
Rob Armstrong
c631850c15 Remove compute capability 8.7 build target for non-Tegra builds 2025-02-04 12:01:38 -08:00
Rob Armstrong
9539ca5fa3 Remove compute capability 7.2 build target for non-Tegra builds 2025-02-04 11:58:25 -08:00
Rob Armstrong
0f4bdfad99 CMake: Add '-Wno-deprecated-gpu-targets' to suppress warning messages during build about Maxwell, Pascal, and Volta 2025-02-04 11:51:17 -08:00
Rob Armstrong
e3a5ae4aca Library samples: Fix malformed CMakeLists.txt 2025-02-04 10:01:28 -08:00
Rob Armstrong
30b411ad56 boxFilter: Fix malformed CMakeLists.txt 2025-02-04 10:01:28 -08:00
Rob Armstrong
941670671c
Merge pull request #15 from rwarmstr/cmake_transition
Cmake transition
2025-02-03 11:14:18 -08:00
Rob Armstrong
be5012ef69 Add automotive Linux build instructions 2025-01-27 10:16:33 -08:00
Rob Armstrong
9824a63101
Merge pull request #14 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update CMakeList.txt for simpleCUFFT_callback and GLES samples (Tegra samples cmake transition)
2025-01-24 08:44:47 -08:00
XSShawnZeng
5f6e6072d9
Merge pull request #8 from XSShawnZeng/cmake_transition
Merge pull request #13 from XSShawnZeng/Tegra_Samples_Cmake_Transition
2025-01-24 15:45:22 +08:00
shawnz
a80688012e Update CMakeList.txt for simpleCUFFT_callback and GLES samples 2025-01-24 15:44:19 +08:00
Rob Armstrong
aad89bb0ab
Merge pull request #13 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update Tegra SMs
2025-01-23 08:39:43 -08:00
shawnz
ab0b386ac4 Update Tegra SMs 2025-01-23 16:20:34 +08:00
XSShawnZeng
5cffc8815a
Merge pull request #7 from XSShawnZeng/cmake_transition
Merge pull request #12 from XSShawnZeng/Tegra_Samples_Cmake_Transition
2025-01-23 16:19:49 +08:00
Rob Armstrong
27c0a166f3
Merge pull request #12 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Tegra samples cmake transition
2025-01-22 21:46:36 -08:00
shawnz
08fae276b4 Add Tegra SMs in CMakeLists.txt for general samples 2025-01-23 11:02:56 +08:00
XSShawnZeng
0f697f1819
Merge pull request #6 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-23 10:43:47 +08:00
Rob Armstrong
b203467419 Update CUDA architectures list 2025-01-22 17:49:44 -08:00
Rob Armstrong
41d5f63ec5 Add note about QNX to main README 2025-01-22 09:51:31 -08:00
Rob Armstrong
3de3f2f46c
Merge pull request #11 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update separate SM list for cdp samples and update watershedSegmentationNPP for the build failure in Bug 4668487
2025-01-21 09:38:41 -08:00
XSShawnZeng
95128a1cf3
Update CMakeLists.txt 2025-01-20 10:05:45 +08:00
XSShawnZeng
3848a7c63c
Update CMakeLists.txt 2025-01-20 10:05:18 +08:00
XSShawnZeng
d9633314f8
Update CMakeLists.txt 2025-01-20 10:04:50 +08:00
XSShawnZeng
576c24f97e
Update CMakeLists.txt 2025-01-20 10:04:24 +08:00
XSShawnZeng
970a6a1151
Update CMakeLists.txt 2025-01-20 10:04:02 +08:00
shawnz
76e2d2052c Update separate SM list for cdp samples and update watershedSegmentationNPP for the build failure in Bug 4668487 2025-01-17 12:04:27 +08:00
XSShawnZeng
03a7300081
Merge pull request #5 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-17 10:51:43 +08:00
Rob Armstrong
ccb341a5e3 Add placeholder toolchain file for QNX cross-compilation 2025-01-16 12:59:26 -08:00
Rob Armstrong
421e49450f Remove outdated build and run information from cudaNvSciBufMultiplanar 2025-01-16 09:06:06 -08:00
Rob Armstrong
886860a123 Ignore local .clangd if it exists 2025-01-16 09:04:37 -08:00
Rob Armstrong
9a454d3ba9
Merge pull request #9 from jnbntz/cmake_transition
Copying input files into BINDIR so that these samples will run properly
2025-01-16 09:02:53 -08:00
Rob Armstrong
1a466282da
Merge pull request #10 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Add Tegra sample cudaNvSciBufMultiplanar
2025-01-16 09:01:44 -08:00
shawnz
545194e7aa Add Tegra sample cudaNvSciBufMultiplanar 2025-01-16 12:22:52 +08:00
XSShawnZeng
e2d3c20bd3
Merge pull request #4 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-16 12:05:33 +08:00
Jonathan Bentz
48e70c8b31 Copying input files into BINDIR so that these samples will run properly 2025-01-15 19:22:54 -06:00
Rob Armstrong
b518bfe9be
Merge pull request #8 from jnbntz/cmake_transition
fixing a couple typos in cmakelists files
2025-01-15 08:43:08 -08:00
Jonathan Bentz
e633580eff a couple more typos in comments 2025-01-14 18:12:09 -06:00
Jonathan Bentz
df9ea79df4 typo errors in a few CMakelists files 2025-01-14 18:09:26 -06:00
Rob Armstrong
eacf41c980 Update cudaNvSciNvMedia CMake search path 2025-01-14 09:20:38 -08:00
Rob Armstrong
f8fbd04007 Update CMake module search path 2025-01-14 09:14:29 -08:00
XSShawnZeng
01956cfecc
Merge pull request #3 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-13 11:10:07 +08:00
Rob Armstrong
380c065a0c Move EGLSync_CUDAEvent_Interop directory entry 2025-01-10 11:15:11 -08:00
Rob Armstrong
6c548b5d43 Update Changelog for removed and moved samples 2025-01-10 08:17:00 -08:00
Rob Armstrong
11c70243db Move EGLSync_CUDAEvent_Interop to Tegra samples 2025-01-10 08:12:56 -08:00
Rob Armstrong
415ad05c98 Remove remaining legacy Makefiles 2025-01-10 08:05:41 -08:00
Rob Armstrong
2d0314212b Remove Tegra QNX samples nbody_screen, simpleGLES_screen 2025-01-10 08:02:43 -08:00
Rob Armstrong
feffc60cbf
Merge pull request #7 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update the CMakeList for remaining tegra samples and remove the old M…
2025-01-10 07:57:39 -08:00
shawnz
d3ded4a251 Update the CMakeLists for remaining Tegra samples and remove the old Makefile/NsightEclipse.xml 2025-01-10 17:11:33 +08:00
Rob Armstrong
62b96a65b5 Add glfw3 existence checks to Vulkan samples 2025-01-09 22:48:37 -08:00
Rob Armstrong
450038ea73
Merge pull request #6 from jnbntz/aarch64_guarding
Changes for building for aarch64, specifically Jetson Orin Nano
2025-01-09 10:06:09 -08:00
Rob Armstrong
ce045e2ae9
Merge branch 'cmake_transition' into aarch64_guarding 2025-01-09 10:03:16 -08:00
Rob Armstrong
f753e86e7a Update all samples to build position-independent code 2025-01-09 09:59:36 -08:00
Jonathan Bentz
a1cf9e4183 Changes for building for aarch64, specifically Jetson Orin Nano 2025-01-08 17:05:45 -06:00
Rob Armstrong
e8492d1a78
Merge pull request #5 from XSShawnZeng/tegra_samples_cmake_transition
Tegra samples cmake transition for cudaNvSci and cuDLA samples
2025-01-08 09:03:13 -08:00
XSShawnZeng
11355321a0
Create CMakeLists.txt for cuDLAStandaloneMode 2025-01-08 16:05:34 +08:00
XSShawnZeng
3532ede709
Create CMakeLists.txt for cuDLALayerwiseStatsStandalone 2025-01-08 16:04:59 +08:00
XSShawnZeng
8f1d565faa
Create CMakeLists.txt for cuDLALayerwiseStatsHybrid 2025-01-08 16:04:06 +08:00
XSShawnZeng
f00f52d154
Create CMakeLists.txt for cuDLAHybridMode 2025-01-08 16:03:24 +08:00
XSShawnZeng
dc24cde377
Create CMakeLists.txt for cuDLAErrorReporting 2025-01-08 16:02:46 +08:00
XSShawnZeng
2c5c6dc7d4
Create CMakeLists.txt for cudaNvSci 2025-01-08 16:01:50 +08:00
XSShawnZeng
3fc438428a
Update Tegra CMakeLists.txt 2025-01-08 16:00:21 +08:00
Rob Armstrong
5409227cf8
Merge pull request #4 from XSShawnZeng/tegra_samples_nboby_opengles
Update the CMakeLists.txt for 3 Tegra samples
2025-01-06 09:38:21 -08:00
XSShawnZeng
e36545ac1c
Create CMakeLists.txt 2025-01-06 18:49:01 +08:00
XSShawnZeng
41a65b94c8
Update CMakeLists.txt to include the Tegra samples 2025-01-06 18:48:12 +08:00
XSShawnZeng
89a84131fb
Update CMakeLists.txt for Tegra sample simpleGLES 2025-01-06 18:47:18 +08:00
XSShawnZeng
bb121fe02b
Create the CMakeLists.txt for Tegra sample fluidsGLES
Able to build the sample as below:
$ make
Linking CUDA device code CMakeFiles/fluidsGLES.dir/cmake_device_link.o
Linking CXX executable fluidsGLES
Built target fluidsGLES
2025-01-06 17:52:53 +08:00
XSShawnZeng
e228ad9389
Update the CMakeLists.txt for Tegra sample nbody_opengles 2025-01-06 17:12:28 +08:00
Rob Armstrong
1e0c660f22 Move GLES and QNX samples to platform-specific subdirectory 2024-12-27 22:56:03 -08:00
Rob Armstrong
e52cdee6c4 Unify Windows-only sample messages 2024-12-26 17:26:37 -08:00
Rob Armstrong
9045dd3a1a
Merge pull request #3 from jnbntz/windows_fixes
Windows fixes for cmake copy commands and some additional nvrtc libs needed on windows builds
2024-12-20 13:25:33 -08:00
Jonathan Bentz
3e7bb5f2e8 updated to fix for Linux 2024-12-20 12:52:06 -06:00
Jonathan Bentz
88e5f4a395 Fixing more copy files in nvrtc builds. 2024-12-20 09:28:17 -06:00
Jonathan Bentz
5c9e573ece fixing copy files for cuSolverRf and cuSolverPs_LowlevelCholesky 2024-12-20 07:48:01 -06:00
Jonathan Bentz
c08a2f31ff fixing copy for windows build cuSolverSp_LinearSolver 2024-12-20 07:31:51 -06:00
Jonathan Bentz
6fe4a6bab8 fix copy for cuSolverDN_LinearSolver 2024-12-19 18:12:20 -06:00
Jonathan Bentz
66631e4f96 cuSolverSp_LowLeverQR copy files individually 2024-12-19 17:44:56 -06:00
Jonathan Bentz
7e1a257265 adding USE_MATH_DEFINES to simpleCUFFT_2d_MGPU 2024-12-19 16:35:19 -06:00
Rob Armstrong
d9efeae3bb
Merge pull request #2 from jnbntz/linux_guarding
Guarding Linux-only samples so they only get built on Linux machines
2024-12-19 11:25:13 -08:00
Jonathan Bentz
9c4287fc51 one more fix on guarding Linux 2024-12-19 13:20:48 -06:00
Jonathan Bentz
8d06d246b3 Updating for more linux guarding 2024-12-19 13:15:39 -06:00
Jonathan Bentz
1292881b37 reverting to previous commit 2024-12-19 10:53:11 -06:00
Rob Armstrong
a55d6682d2 Update DirectX samples for Windows builds 2024-12-18 20:06:37 -08:00
Jonathan Bentz
f9dd4323af adding code to Linux-only samples to guard against building on Windows 2024-12-18 14:36:12 -06:00
Rob Armstrong
fcd39008ec Add BUILD_TEGRA flag 2024-12-18 11:28:38 -08:00
Rob Armstrong
01b7ee41f6 Update DirectX samples 2024-12-18 11:28:26 -08:00
Rob Armstrong
25b33d2d04 Update top-level build instructions 2024-12-18 11:02:35 -08:00
Rob Armstrong
bfd956bc5e Remove README references to PPC processors (no longer supported) 2024-12-18 10:54:37 -08:00
Rob Armstrong
0f5821a8c8 Remove outdated build instructions from README.md 2024-12-18 10:52:24 -08:00
Rob Armstrong
c1301d000a Resolve Windows build issues 2024-12-17 22:11:52 -08:00
Rob Armstrong
7f5859dda9 Update .gitignore for Visual Studio builds 2024-12-17 21:46:53 -08:00
Rob Armstrong
abb97e1dfb Update copy_directory_if_different to copy_directory for CMake 3.20 compatibility 2024-12-17 10:22:02 -08:00
Rob Armstrong
22bedd5cf0 Update CMake project language settings for library samples 2024-12-16 16:46:57 -08:00
Rob Armstrong
d54d4d7419 Update supported architecture list for some samples 2024-12-16 16:35:23 -08:00
Rob Armstrong
03719b7623 Change remaining build targets to specify active SM variants 2024-12-16 16:17:14 -08:00
Rob Armstrong
11fc617794 Refactor CMakeLists.txt under 5_Domain_Specific 2024-12-16 16:10:56 -08:00
Rob Armstrong
7d7e0777a5 Update volumeFiltering, volumeRender, vulkanImageCUDA 2024-12-16 16:07:41 -08:00
Rob Armstrong
a60b4a984e Update SobelFilter, SobolQRNG, stereoDisparity 2024-12-16 15:55:04 -08:00
Rob Armstrong
7b0068a433 Update simpleVulkanMMAP, smokeParticles 2024-12-16 15:11:32 -08:00
Rob Armstrong
2ad2272e3d Change some applicable build architectures 2024-12-16 14:52:34 -08:00
Rob Armstrong
cbfab74480 Refactor CMakeLists.txt under 6_Performance 2024-12-16 14:52:10 -08:00
Rob Armstrong
00999c1789 Refactor CMakeLists.txt under 4_CUDA_Libraries 2024-12-16 14:38:36 -08:00
Rob Armstrong
090f957854 Refactor CMakeLists.txt under 3_CUDA_Features 2024-12-16 14:37:14 -08:00
Rob Armstrong
ea694a4b0d Refactor CMakeLists.txt under 2_Concepts_and_Techniques 2024-12-16 14:34:20 -08:00
Rob Armstrong
281daef279 Refactor CMakeLists.txt under 1_Utilities 2024-12-16 14:30:38 -08:00
Rob Armstrong
23928df4ff Change build target to specify active SM variants 2024-12-16 14:11:47 -08:00
Rob Armstrong
8d2e39c395 Make target compile options, language standards target-specific 2024-12-16 12:01:03 -08:00
Rob Armstrong
1bb070deba Make each CMakeLists.txt under 0_Introduction its own project 2024-12-16 09:23:32 -08:00
Rob Armstrong
07e5fc5473 Update simpleGL, simpleVulkan 2024-12-16 09:18:15 -08:00
Rob Armstrong
de204853cf Update postProcessGL, quasirandomGenerator, quasirandomGenerator_nvrtc 2024-12-14 10:58:26 -08:00
Rob Armstrong
3f0693b37e Update nbody, NV12toBGRandResize, p2pBandwidthlatencyTest 2024-12-14 10:49:04 -08:00
Rob Armstrong
4fc7cdb95c Update HSOpticalFlow, Mandelbrot 2024-12-14 10:40:33 -08:00
Rob Armstrong
ba77e8d23b Update fluidsGL, marchingCubes, MonteCarloMultiGPU 2024-12-14 10:28:20 -08:00
Rob Armstrong
10e9b975c4 Update fastWalshTransform, FDTD3d 2024-12-14 10:00:39 -08:00
Rob Armstrong
8b5c84cd22 Remove FDTD3d CMakeLists.txt entry 2024-12-13 16:11:27 -08:00
Rob Armstrong
6929eea2ed Update BlackScholes, BlackScholes_nvrtc, convolutionFFT2D, dwtHaar1D, dxtc 2024-12-13 16:09:49 -08:00
Rob Armstrong
cd485da765 Fix bicubicTexture CMakeLists.txt 2024-12-13 16:03:19 -08:00
Rob Armstrong
223e658beb Update bicubicTexture, bilateralFilter, binomialOptions, binomialOptions_nvrtc 2024-12-13 16:00:40 -08:00
Rob Armstrong
c9794ff283 Remove legacy fluidsD3D9 2024-12-13 15:37:58 -08:00
Rob Armstrong
035dcfd357 Remove legacy Direct3D 9 and 10 interoperability samples 2024-12-13 15:29:09 -08:00
Rob Armstrong
37922e6429 Update simpleCUFFT, simpleCUFFT_2d_MGPU, simpleCUFFT_callback, simpleCUFFT_MGPU 2024-12-13 14:47:58 -08:00
Rob Armstrong
05d2f991de Update simpleCUBLAS, simpleCUBLAS_LU, simpleCUBLASXT 2024-12-13 14:43:47 -08:00
Rob Armstrong
b6f3065605 Update oceanFFT, randomFog 2024-12-13 14:40:06 -08:00
Rob Armstrong
461fc3c649 Update nvJPEG, nvJPEG_encoder 2024-12-13 14:35:27 -08:00
Rob Armstrong
b1837c0e4e Update jitLto, lineOfSight, matrixMulCUBLAS, MersenneTwisterGP11213 2024-12-13 13:54:50 -08:00
Rob Armstrong
7fde420160 Update FilterBorderControlNPP, freeImageInteropNPP, histEqualizationNPP, watershedSegmentationNPP 2024-12-13 13:47:24 -08:00
Rob Armstrong
0d161038a2 Update cuSolver samples 2024-12-13 12:11:13 -08:00
Rob Armstrong
ee8ff3cf5b Move cuDLA, NVSci samples to 8_Platform_Specific/Tegra 2024-12-13 11:58:52 -08:00
Rob Armstrong
7568673fa6 Update conjugate gradient samples 2024-12-13 11:45:05 -08:00
Rob Armstrong
4543e7bbab Update cannyEdgeDetectorNPP 2024-12-13 11:05:35 -08:00
Rob Armstrong
a3be0d3cd8 Update boxFilterNPP, delete batchedLabelMarkersAndLabelCompressionNPP 2024-12-13 10:38:24 -08:00
Rob Armstrong
89f2e5c0c3 Integrate libNVVM samples 2024-12-12 12:06:34 -08:00
Rob Armstrong
f93a9ab81c Update UnifiedMemoryPerf 2024-12-12 11:50:35 -08:00
Rob Armstrong
fb1eaa8323 Update alignedTypes, cudaGraphsPerfScaling, LargeKernelParameter, transpose 2024-12-12 11:48:07 -08:00
Rob Armstrong
2f826e305a Update ptxjit 2024-12-12 11:34:01 -08:00
Rob Armstrong
9bebdf7ef4 Update jacobiCudaGraphs, memMapIPCDrv, newdelete, simpleCudaGraphs, tf32TensorCoreGemm, warpAggregatedAtomicsCG 2024-12-12 11:29:48 -08:00
Rob Armstrong
6fd8228242 Update graphMemoryFootprint, graphMemoryNodes, immaTensorCoreGemm 2024-12-12 11:19:17 -08:00
Rob Armstrong
76210c84f7 Update cudaCompressibleMemory, cudaTensorCoreGemm, dmmaTensorCoreGemm, globalToShmemAsyncCopy, graphConditionalNodes 2024-12-12 11:17:35 -08:00
Rob Armstrong
62d32b38d7 Update bindlessTexture, CDP samples 2024-12-12 11:11:51 -08:00
Rob Armstrong
dd73281bc6 Update StreamPriorities, bf16TensorCoreGemm, binaryPartitionCG 2024-12-12 11:02:13 -08:00
Rob Armstrong
df5ff58ca4 Update EGL samples (NB EGLSync_CUDAEvent_Interop link error in debug) 2024-12-12 10:43:28 -08:00
Rob Armstrong
c7fe3b2f4c Update Monte Carlo pricing models 2024-12-12 09:29:50 -08:00
Rob Armstrong
cd51392c0f Update threadMigration, threadFenceReduction 2024-12-12 09:19:48 -08:00
Rob Armstrong
0eaf6d4198 Update shfl_scan, sortingNetworks, streamOrderedAllocation, streamOrderedAllocationIPC, streamOrderedAllocationP2P 2024-12-12 09:14:11 -08:00
Rob Armstrong
551f0dbebe Update scan, scalarProd, segmentationTreeThrust 2024-12-12 09:02:53 -08:00
Rob Armstrong
29bde74c86 Update radixSortThrust, reduction, reductionMultiBlockCG 2024-12-12 08:50:34 -08:00
Rob Armstrong
a60926da2c Update interval, particles 2024-12-12 08:46:51 -08:00
Rob Armstrong
039e445f93 Update imageDenoising, inlinePTX, inlinePTX_nvrtc 2024-12-12 08:22:20 -08:00
Rob Armstrong
3fd2fbce00 Update histogram, FunctionPointers 2024-12-12 08:14:59 -08:00
Rob Armstrong
b284c28ce5 Update eigenvalues 2024-12-11 18:08:52 -08:00
Rob Armstrong
74107d2da1 Update boxFilter, convolutionSeparable, convolutionTexture, dct8x8, delete cuHook 2024-12-11 18:01:43 -08:00
Rob Armstrong
12d096790b Update utility samples 2024-12-11 16:51:56 -08:00
Rob Armstrong
5fab4d73f5 Introductory README - remove entries for removed samples 2024-12-11 16:12:55 -08:00
Rob Armstrong
8a2c175abc Update README (not final) 2024-12-11 16:10:13 -08:00
Rob Armstrong
0dc8ecc1f6 Update vector add samples 2024-12-11 16:06:48 -08:00
Rob Armstrong
a14d734664 Update template, simpleVoteIntrinsics - remove simpleVoteIntrinsics_nvrtc 2024-12-11 15:58:41 -08:00
Rob Armstrong
56ad17f97c Update simpleTexture3D, simpleTextureDrv 2024-12-11 15:55:04 -08:00
Rob Armstrong
7013a0b70a Update simpleTemplates, simpleTexture 2024-12-11 15:45:57 -08:00
Rob Armstrong
a461e61485 Remove simpleTemplates_nvrtc 2024-12-11 15:41:41 -08:00
Rob Armstrong
769a225af3 Remove simpleSeparateCompilation 2024-12-11 15:38:08 -08:00
Rob Armstrong
53a02af443 Uniquify fatbin generation target for matrixMulDrv 2024-12-11 15:25:53 -08:00
Rob Armstrong
9b70771583 Update simpleMPI, simpleDrvRuntime 2024-12-11 15:24:51 -08:00
Rob Armstrong
7ce058b479 Update simpleCallback, simpleCUDA2GL 2024-12-11 15:09:43 -08:00
Rob Armstrong
8663c8bf19 Update matrixMul_nvrtc (TODO: Fix search paths) 2024-12-11 22:55:27 +00:00
Rob Armstrong
2dfdd55e29 Update matrixMulDrv, matrixMulDynlinkJIT 2024-12-11 22:33:25 +00:00
Rob Armstrong
c52cfc339f Update UnifiedMemoryStreams, simpleStreams, simpleSurfaceWrite, simpleZeroCopy, systemWideAtomics 2024-12-11 20:56:37 +00:00
Rob Armstrong
bcf4a0dd31 Update simpleOccupancy, simpleP2P, simplePitchLinearTexture, simplePrintf 2024-12-11 20:44:47 +00:00
Rob Armstrong
9efe753a74 Update simpleHyperQ, simpleIPC, simpleLayeredTexture, simpleMultiCopy, simpleMultiGPU 2024-12-11 20:42:18 +00:00
Rob Armstrong
2dee482699 Update simpleAtomicIntrinsics_nvrtc 2024-12-11 20:35:09 +00:00
Rob Armstrong
29a2098575 Update simpleAtomicIntrinsics 2024-12-11 20:30:56 +00:00
Rob Armstrong
a0774d28f1 Update simpleAssert_nvrtc 2024-12-11 20:28:53 +00:00
Rob Armstrong
190d0cbaa7 Add missed CMakeLists 2024-12-11 20:24:08 +00:00
Rob Armstrong
e53fcad712 Update simpleAssert, simpleAttributes, simpleCooperativeGroups, simpleCubemapTexture 2024-12-11 20:23:04 +00:00
Rob Armstrong
d06e42bf06 Update simpleAWBarrier 2024-12-11 20:16:50 +00:00
Rob Armstrong
fd2c269ce3 Update mergeSort 2024-12-11 19:36:54 +00:00
Rob Armstrong
a1cd67ca87 Update matrixMul 2024-12-11 19:22:13 +00:00
Rob Armstrong
9a207a910a Update fp16ScalarProduct 2024-12-11 19:20:36 +00:00
Rob Armstrong
2a4d019282 Update cudaOpenMP 2024-12-11 19:18:12 +00:00
Rob Armstrong
8e03cbfcf9 Remove cppOverload 2024-12-11 19:12:34 +00:00
Rob Armstrong
9d8f61431e Remove cppIntegration 2024-12-11 19:11:40 +00:00
Rob Armstrong
ff264a798f Remove concurrentKernels 2024-12-11 17:21:00 +00:00
Rob Armstrong
274836a1a2 Update clock_nvrtc 2024-12-11 17:20:34 +00:00
Rob Armstrong
c15a0c4bbf Remove c++11_cuda sample 2024-12-11 16:46:54 +00:00
Rob Armstrong
912f37963a Add basic CMake framework 2024-12-11 16:46:14 +00:00
Rob Armstrong
82bcada84c Remove now-unnecessary Visual Studio project files 2024-12-11 16:25:06 +00:00
steffen-v
22424227e7
fix "gridy" comandline argument for initMC 2024-07-26 14:42:05 +02:00
Rob Nertney
9c688d7ff7 Updating samples for CUDA 12.5 2024-07-25 16:30:13 +00:00
Rob Nertney
5f97d7d0df Updating graphConditionalNodes orphan directory 2024-04-10 19:44:42 +00:00
Rob Nertney
3559ca4d08 Updating README with Confidential Computing notes 2024-03-05 21:01:35 +00:00
Rob Nertney
cd3bc1fa8e Updating samples for CUDA 12.4 2024-03-05 20:53:50 +00:00
Sangeet S
42ff742bf5
Merge pull request #1 from sangeetsatheesh/sangeetsatheesh-fix-typo
Fix typo #161
2024-01-17 13:16:53 -05:00
Sangeet S
8ccb13c6f0
Fix typo #161
Fix typo in line 14 from "simple exemple" to "simple example"
2024-01-17 13:16:01 -05:00
Rob Nertney
e8568c4173 Fixing jitlto regression, including missing cuDLA source files for bug #235, and updating changelogs 2023-11-09 16:52:00 +00:00
Rob Nertney
b5c84e6996 Updating Samples for 12.3 and updating props files 2023-10-23 18:44:49 +00:00
Rob Nertney
c46754b877 Update samples for 12.3 2023-10-20 17:38:48 +00:00
Rob Nertney
03309a2d42 Changelog updates 2023-06-29 19:33:40 +00:00
Rob Nertney
5688ee0013 Removing stray cpp from master 2023-05-31 17:48:13 +00:00
Rob Nertney
8004ad59ab Fix #194 and add Large Kernel Parameters Sample 2023-05-31 04:43:22 +00:00
Rob Nertney
e612904184
Merge pull request #182 from Wenlong-Zhu/master
Fix cudaExtent.width set error.
2023-03-27 20:53:45 -07:00
Rob Nertney
81cf058e30 Updating Samples for 12.1 2023-03-01 01:41:29 +00:00
Rob Nertney
26665bf33b Fixing README 2023-02-27 22:35:39 +00:00
Rob Nertney
00bb9bc367 Updating files for Ada architecture 2023-02-27 22:33:19 +00:00
Rob Nertney
e4789153d5 Updating License Header 2023-02-09 19:02:33 +00:00
Rob Nertney
1c2efac7c8 Adding SM number for Ada Architecture 2023-02-07 19:06:53 +00:00
Rob Nertney
3d553b2ea1 Adding JIT LTO Sample 2023-02-07 19:06:38 +00:00
wenlong-zhu
9316529638 Fix cudaExtent.width set error.
unit: 4_CUDA_Libraries/cudaNvSciNvMedia/cuda_consumer.cu
Because of the change of padding size in NvSciBuf,
the cudaExtent.width and cudaExtent.height should be changed

Bug 3880762
2023-02-04 00:00:44 +08:00
Rob Nertney
2b689228b7 Updating samples for 12.0 2022-12-08 20:19:55 +00:00
Rob Nertney
81992093d2 Update samples for CUDA 11.8 with correct props 2022-10-14 17:43:37 -07:00
Rutwik Choughule
b312abaa07 add check for filename in nvrtc_helper.h 2022-02-03 18:12:24 +05:30
Rutwik Choughule
8f21b899b6 update dependency related links in README files 2022-01-27 17:58:13 +05:30
Rutwik Choughule
0cbe5f2d82 update makefiles to waive unsupported samples on QNX 2022-01-27 17:57:02 +05:30
Rutwik Choughule
805e60bdfc update lib path for conda 2022-01-27 17:55:38 +05:30
Rutwik Choughule
9d4c014f60 update sample cudaNvSci 2022-01-25 17:22:31 +05:30
Rutwik Choughule
bf8c6dd043 update lib path for conda 2022-01-14 02:31:40 +05:30
2926 changed files with 127291 additions and 371137 deletions

49
.clang-format Normal file

@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
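
For reference, the style above can also be applied by hand; a minimal sketch, assuming `clang-format` is installed and invoked from the repository root with the same arguments the clang-format pre-commit hook uses below (the sample path is purely illustrative):

```bash
# Apply the repository's .clang-format in place; -style=file picks it up automatically
clang-format -fallback-style=none -style=file -i Samples/0_Introduction/matrixMul/matrixMul.cu
```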

6
.gitignore vendored Normal file

@@ -0,0 +1,6 @@
build
.vs
.clangd
test
settings.json
launch.json

106
.pre-commit-config.yaml Normal file

@@ -0,0 +1,106 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]

CHANGELOG.md

@@ -1,5 +1,86 @@
## Changelog
### CUDA 12.9
* Updated toolchain for cross-compilation for Tegra Linux platforms.
* Added `run_tests.py` utility to exercise all samples. See README.md for details
* Repository has been updated with consistent code formatting across all samples
* Many small code tweaks and bug fixes (see commit history for details)
* Removed the following outdated samples:
* `1_Utilities`
* `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth
testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth)
### CUDA 12.8
* Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles.
* Removed the following outdated samples:
* `0_Introduction`
* `c++11_cuda` demonstrating CUDA and C++ 11 interoperability (reason: obsolete)
* `concurrentKernels` demonstrating the ability to run multiple kernels simultaneously (reason: obsolete)
* `cppIntegration` demonstrating calling between .cu and .cpp files (reason: obsolete)
* `cppOverload` demonstrating C++ function overloading (reason: obsolete)
* `simpleSeparateCompilation` demonstrating NVCC compilation to a static library (reason: trivial)
* `simpleTemplates_nvrtc` demonstrating NVRTC usage for `simpleTemplates` sample (reason: redundant)
* `simpleVoteIntrinsics_nvrtc` demonstrating NVRTC usage for `simpleVoteIntrinsics` sample (reason: redundant)
* `2_Concepts_and_Techniques`
* `cuHook` demonstrating dlsym hooks. (reason: incompatible with modern `glibc`)
* `4_CUDA_Libraries`
* `batchedLabelMarkersAndLabelCompressionNPP` demonstrating NPP features (reason: some functionality removed from library)
* `5_Domain_Specific`
* Legacy Direct3D 9 and 10 interoperability samples:
* `fluidsD3D9`
* `simpleD3D10`
* `simpleD3D10RenderTarget`
* `simpleD3D10Texture`
* `simpleD3D9`
* `simpleD3D9Texture`
* `SLID3D10Texture`
* `VFlockingD3D10`
* `8_Platform_Specific/Tegra`
* Temporarily removed the following two samples pending updates:
* `nbody_screen` demonstrating the nbody sample in QNX
* `simpleGLES_screen` demonstrating GLES interop in QNX
* Moved the following Tegra-specific samples to a dedicated subdirectory: `8_Platform_Specific/Tegra`
* `EGLSync_CUDAEvent_Interop`
* `cuDLAErrorReporting`
* `cuDLAHybridMode`
* `cuDLALayerwiseStatsHybrid`
* `cuDLALayerwiseStatsStandalone`
* `cuDLAStandaloneMode`
* `cudaNvSciBufMultiplanar`
* `cudaNvSciNvMedia`
* `fluidsGLES`
* `nbody_opengles`
* `simpleGLES`
* `simpleGLES_EGLOutput`
### CUDA 12.5
### CUDA 12.4
* Added graphConditionalNodes Sample
### CUDA 12.3
* Added cuDLA samples
* Fixed jitLto regression
### CUDA 12.2
* libNVVM samples received updates
* Fixed jitLto Case issues
* Enabled HOST_COMPILER flag in the makefiles for GCC, which is untested but may still work.
### CUDA 12.1
* Added new sample for Large Kernels
### CUDA 12.0
* Added new flags for JIT compiling
* Removed deprecated APIs in Hopper Architecture
### CUDA 11.6
* Added new folder structure for samples
* Added support of Visual Studio 2022 to all samples supported on [Windows](#windows-1).
* All CUDA samples are now only available on [GitHub](https://github.com/nvidia/cuda-samples). They are no longer available via CUDA toolkit.
### CUDA 11.5
* Added `cuDLAHybridMode`. Demonstrate usage of cuDLA in hybrid mode.
* Added `cuDLAStandaloneMode`. Demonstrate usage of cuDLA in standalone mode.
@@ -114,4 +195,4 @@ This is the first release of CUDA Samples on GitHub:
* Added `conjugateGradientMultiBlockCG`. Demonstrates a conjugate gradient solver on GPU using Multi Block Cooperative Groups.
* Added `conjugateGradientMultiDeviceCG`. Demonstrates a conjugate gradient solver on multiple GPUs using Multi Device Cooperative Groups, also uses unified memory prefetching and usage hints APIs.
* Added `simpleCUBLAS`. Demonstrates how to perform GEMM operations using CUBLAS library.
* Added `simpleCUFFT`. Demonstrates how to perform FFT operations using CUFFT library.

27
CMakeLists.txt Normal file

@@ -0,0 +1,27 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(cuda-samples LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
add_subdirectory(Samples)
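
As a usage sketch (an assumption about a typical workflow, not part of this diff), the top-level project can be configured and built out of tree, with `ENABLE_CUDA_DEBUG` selecting the `-G` branch above instead of `-lineinfo`:

```bash
# Configure an out-of-tree build; ENABLE_CUDA_DEBUG=ON compiles device code with -G for cuda-gdb
cmake -B build -DENABLE_CUDA_DEBUG=ON
# Build all samples in parallel
cmake --build build -j
```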

103
CONTRIBUTING.md Normal file

@@ -0,0 +1,103 @@
# Contributing to the CUDA Samples
Thank you for your interest in contributing to the CUDA Samples!
## Getting Started
1. **Fork & Clone the Repository**:
Fork the repository and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository).
## Making Changes
1. **Create a New Branch**:
```bash
git checkout -b your-feature-branch
```
2. **Make Changes**.
3. **Build and Test**:
Ensure changes don't break existing functionality by building and running tests.
For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below.
4. **Commit Changes**:
```bash
git commit -m "Brief description of the change"
```
## Building and Testing
For information on building and running tests on the samples, please refer to the main [README](README.md).
## Creating a Pull Request
1. Push changes to your fork
2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request.
3. Describe the purpose and context of the changes in the pull request description.
## Code Formatting (pre-commit hooks)
The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These
tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter
versions and options are aligned for all developers. Additionally, there is a CI check in place to
enforce that committed code follows our standards.
The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`.
For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html).
To use `pre-commit`, install via `conda` or `pip`:
```bash
conda config --add channels conda-forge
conda install pre-commit
```
```bash
pip install pre-commit
```
Then run pre-commit hooks before committing code:
```bash
pre-commit run
```
By default, pre-commit runs on staged files (only changes and additions that will be committed).
To run pre-commit checks on all files, execute:
```bash
pre-commit run --all-files
```
Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:
```bash
pre-commit install
```
Now code linters and formatters will be run each time you commit changes.
You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, although please note
that this may result in pull requests being rejected if subsequent checks fail.
## Review Process
Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code.
For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/).
Further recommended reading for successful PR reviews:
- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/)
- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/)
## Thank You
Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration!

dynlink_d3d10.h

@@ -1,294 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//--------------------------------------------------------------------------------------
// File: dynlink_d3d10.h
//
// Shortcut macros and functions for using DX objects
//
// Copyright (c) Microsoft Corporation. All rights reserved
//--------------------------------------------------------------------------------------
#ifndef _DYNLINK_D3D10_H_
#define _DYNLINK_D3D10_H_
// Standard Windows includes
#include <windows.h>
#include <initguid.h>
#include <assert.h>
#include <wchar.h>
#include <mmsystem.h>
#include <commctrl.h> // for InitCommonControls()
#include <shellapi.h> // for ExtractIcon()
#include <new.h> // for placement new
#include <shlobj.h>
#include <math.h>
#include <limits.h>
#include <stdio.h>
// CRT's memory leak detection
#if defined(DEBUG) || defined(_DEBUG)
#include <crtdbg.h>
#endif
// Direct3D9 includes
#include <d3d9.h>
// Direct3D10 includes
#include <dxgi.h>
#include <d3d10_1.h>
#include <d3d10.h>
// XInput includes
#include <xinput.h>
// strsafe.h deprecates old unsecure string functions. If you
// really do not want it to (not recommended), then uncomment the next line
//#define STRSAFE_NO_DEPRECATE
#ifndef STRSAFE_NO_DEPRECATE
#pragma deprecated("strncpy")
#pragma deprecated("wcsncpy")
#pragma deprecated("_tcsncpy")
#pragma deprecated("wcsncat")
#pragma deprecated("strncat")
#pragma deprecated("_tcsncat")
#endif
#pragma warning( disable : 4996 ) // disable deprecated warning
#include <strsafe.h>
#pragma warning( default : 4996 )
#include <DirectXMath.h>
using namespace DirectX;
//--------------------------------------------------------------------------------------
// Structs
//--------------------------------------------------------------------------------------
struct DXUTD3D9DeviceSettings
{
UINT AdapterOrdinal;
D3DDEVTYPE DeviceType;
D3DFORMAT AdapterFormat;
DWORD BehaviorFlags;
D3DPRESENT_PARAMETERS pp;
};
struct DXUTD3D10DeviceSettings
{
UINT AdapterOrdinal;
D3D10_DRIVER_TYPE DriverType;
UINT Output;
DXGI_SWAP_CHAIN_DESC sd;
UINT32 CreateFlags;
UINT32 SyncInterval;
DWORD PresentFlags;
bool AutoCreateDepthStencil; // DXUT will create the depth stencil resource and view if true
DXGI_FORMAT AutoDepthStencilFormat;
};
enum DXUTDeviceVersion { DXUT_D3D9_DEVICE, DXUT_D3D10_DEVICE };
struct DXUTDeviceSettings
{
DXUTDeviceVersion ver;
union
{
DXUTD3D9DeviceSettings d3d9; // only valid if ver == DXUT_D3D9_DEVICE
DXUTD3D10DeviceSettings d3d10; // only valid if ver == DXUT_D3D10_DEVICE
};
};
//--------------------------------------------------------------------------------------
// Error codes
//--------------------------------------------------------------------------------------
#define DXUTERR_NODIRECT3D MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0901)
#define DXUTERR_NOCOMPATIBLEDEVICES MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0902)
#define DXUTERR_MEDIANOTFOUND MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0903)
#define DXUTERR_NONZEROREFCOUNT MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0904)
#define DXUTERR_CREATINGDEVICE MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0905)
#define DXUTERR_RESETTINGDEVICE MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0906)
#define DXUTERR_CREATINGDEVICEOBJECTS MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0907)
#define DXUTERR_RESETTINGDEVICEOBJECTS MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0908)
#define DXUTERR_DEVICEREMOVED MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x090A)
typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT, UINT32,
ID3D10Device **);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE1)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT,
D3D10_FEATURE_LEVEL1, UINT, ID3D10Device1 **);
typedef HRESULT(WINAPI *LPD3D10CREATESTATEBLOCK)(ID3D10Device *pDevice, D3D10_STATE_BLOCK_MASK *pStateBlockMask,
ID3D10StateBlock **ppStateBlock);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKUNION)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
D3D10_STATE_BLOCK_MASK *pResult);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKINTERSECT)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
D3D10_STATE_BLOCK_MASK *pResult);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDIFFERENCE)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
D3D10_STATE_BLOCK_MASK *pResult);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
UINT RangeLength);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
UINT RangeLength);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLEALL)(D3D10_STATE_BLOCK_MASK *pMask);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLEALL)(D3D10_STATE_BLOCK_MASK *pMask);
typedef BOOL (WINAPI *LPD3D10STATEBLOCKMASKGETSETTING)(D3D10_STATE_BLOCK_MASK *pMask,
D3D10_DEVICE_STATE_TYPES StateType, UINT Entry);
typedef HRESULT(WINAPI *LPD3D10COMPILEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, LPCSTR pSrcFileName,
CONST D3D10_SHADER_MACRO *pDefines,
ID3D10Include *pInclude, UINT HLSLFlags, UINT FXFlags,
ID3D10Blob **ppCompiledEffect, ID3D10Blob **ppErrors);
typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
ID3D10Device *pDevice,
ID3D10EffectPool *pEffectPool,
ID3D10Effect **ppEffect);
typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTPOOLFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
ID3D10Device *pDevice, ID3D10EffectPool **ppEffectPool);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN)(IDXGIAdapter *pAdapter,
D3D10_DRIVER_TYPE DriverType,
HMODULE Software,
UINT Flags,
UINT SDKVersion,
DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
IDXGISwapChain **ppSwapChain,
ID3D10Device **ppDevice);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN1)(IDXGIAdapter *pAdapter,
D3D10_DRIVER_TYPE DriverType,
HMODULE Software,
UINT Flags,
D3D10_FEATURE_LEVEL1 HardwareLevel,
UINT SDKVersion,
DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
IDXGISwapChain **ppSwapChain,
ID3D10Device1 **ppDevice);
// Module and function pointers
static HMODULE g_hModDXGI = NULL;
static HMODULE g_hModD3D10 = NULL;
static HMODULE g_hModD3D101 = NULL;
static LPCREATEDXGIFACTORY sFnPtr_CreateDXGIFactory = NULL;
static LPD3D10CREATESTATEBLOCK sFnPtr_D3D10CreateStateBlock = NULL;
static LPD3D10CREATEDEVICE sFnPtr_D3D10CreateDevice = NULL;
static LPD3D10CREATEDEVICE1 sFnPtr_D3D10CreateDevice1 = NULL;
static LPD3D10STATEBLOCKMASKUNION sFnPtr_D3D10StateBlockMaskUnion = NULL;
static LPD3D10STATEBLOCKMASKINTERSECT sFnPtr_D3D10StateBlockMaskIntersect = NULL;
static LPD3D10STATEBLOCKMASKDIFFERENCE sFnPtr_D3D10StateBlockMaskDifference = NULL;
static LPD3D10STATEBLOCKMASKENABLECAPTURE sFnPtr_D3D10StateBlockMaskEnableCapture = NULL;
static LPD3D10STATEBLOCKMASKDISABLECAPTURE sFnPtr_D3D10StateBlockMaskDisableCapture = NULL;
static LPD3D10STATEBLOCKMASKENABLEALL sFnPtr_D3D10StateBlockMaskEnableAll = NULL;
static LPD3D10STATEBLOCKMASKDISABLEALL sFnPtr_D3D10StateBlockMaskDisableAll = NULL;
static LPD3D10STATEBLOCKMASKGETSETTING sFnPtr_D3D10StateBlockMaskGetSetting = NULL;
static LPD3D10COMPILEEFFECTFROMMEMORY sFnPtr_D3D10CompileEffectFromMemory = NULL;
static LPD3D10CREATEEFFECTFROMMEMORY sFnPtr_D3D10CreateEffectFromMemory = NULL;
static LPD3D10CREATEEFFECTPOOLFROMMEMORY sFnPtr_D3D10CreateEffectPoolFromMemory = NULL;
static LPD3D10CREATEDEVICEANDSWAPCHAIN sFnPtr_D3D10CreateDeviceAndSwapChain = NULL;
static LPD3D10CREATEDEVICEANDSWAPCHAIN1 sFnPtr_D3D10CreateDeviceAndSwapChain1 = NULL;
// unload the D3D10 DLLs
static bool dynlinkUnloadD3D10API(void)
{
if (g_hModD3D10)
{
FreeLibrary(g_hModD3D10);
g_hModD3D10 = NULL;
}
if (g_hModDXGI)
{
FreeLibrary(g_hModDXGI);
g_hModDXGI = NULL;
}
if (g_hModD3D101)
{
FreeLibrary(g_hModD3D101);
g_hModD3D101 = NULL;
}
return true;
}
// Dynamically load the D3D10 DLLs and map the function pointers
static bool dynlinkLoadD3D10API(void)
{
// First check to see if the D3D10 library is present.
// If it is, we can call GetProcAddress to grab all of the D3D10 functions
g_hModD3D10 = LoadLibrary("d3d10.dll");
if (g_hModD3D10 != NULL)
{
sFnPtr_D3D10CreateStateBlock = (LPD3D10CREATESTATEBLOCK) GetProcAddress(g_hModD3D10, "D3D10CreateStateBlock");
sFnPtr_D3D10CreateDevice = (LPD3D10CREATEDEVICE) GetProcAddress(g_hModD3D10, "D3D10CreateDevice");
sFnPtr_D3D10StateBlockMaskUnion = (LPD3D10STATEBLOCKMASKUNION) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskUnion");
sFnPtr_D3D10StateBlockMaskIntersect = (LPD3D10STATEBLOCKMASKINTERSECT) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskIntersect");
sFnPtr_D3D10StateBlockMaskDifference = (LPD3D10STATEBLOCKMASKDIFFERENCE) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDifference");
sFnPtr_D3D10StateBlockMaskEnableCapture = (LPD3D10STATEBLOCKMASKENABLECAPTURE) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableCapture");
sFnPtr_D3D10StateBlockMaskDisableCapture = (LPD3D10STATEBLOCKMASKDISABLECAPTURE)GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableCapture");
sFnPtr_D3D10StateBlockMaskEnableAll = (LPD3D10STATEBLOCKMASKENABLEALL) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableAll");
sFnPtr_D3D10StateBlockMaskDisableAll = (LPD3D10STATEBLOCKMASKDISABLEALL) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableAll");
sFnPtr_D3D10StateBlockMaskGetSetting = (LPD3D10STATEBLOCKMASKGETSETTING) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskGetSetting");
sFnPtr_D3D10CompileEffectFromMemory = (LPD3D10COMPILEEFFECTFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CompileEffectFromMemory");
sFnPtr_D3D10CreateEffectFromMemory = (LPD3D10CREATEEFFECTFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CreateEffectFromMemory");
sFnPtr_D3D10CreateEffectPoolFromMemory = (LPD3D10CREATEEFFECTPOOLFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CreateEffectPoolFromMemory");
sFnPtr_D3D10CreateDeviceAndSwapChain = (LPD3D10CREATEDEVICEANDSWAPCHAIN) GetProcAddress(g_hModD3D10, "D3D10CreateDeviceAndSwapChain");
}
g_hModDXGI = LoadLibrary("dxgi.dll");
if (g_hModDXGI)
{
sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY) GetProcAddress(g_hModDXGI , "CreateDXGIFactory");
}
// This may fail if this machine isn't Windows Vista SP1 or later
g_hModD3D101 = LoadLibrary("d3d10_1.dll");
if (g_hModD3D101 != NULL)
{
sFnPtr_D3D10CreateDevice1 = (LPD3D10CREATEDEVICE1) GetProcAddress(g_hModD3D101, "D3D10CreateDevice1");
sFnPtr_D3D10CreateDeviceAndSwapChain1 = (LPD3D10CREATEDEVICEANDSWAPCHAIN1) GetProcAddress(g_hModD3D101, "D3D10CreateDeviceAndSwapChain1");
}
if (g_hModD3D10 == NULL || g_hModDXGI == NULL || g_hModD3D101 == NULL)
{
dynlinkUnloadD3D10API();
return false;
}
return true;
}
#endif
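The helpers above make D3D10 a run-time rather than a link-time dependency: the DLLs are probed with `LoadLibrary` and the entry points resolved with `GetProcAddress`, so the sample can still start on machines without the D3D10 runtime. A minimal caller might look like the following sketch (illustrative only; it assumes the declarations above are in scope and that `D3D10_SDK_VERSION` comes from the D3D10 headers):
```cpp
// Illustrative sketch, not part of the original source.
ID3D10Device *CreateDeviceIfAvailable()
{
    if (!dynlinkLoadD3D10API()) // probes d3d10.dll, dxgi.dll and d3d10_1.dll
        return NULL;            // D3D10 runtime not installed on this machine

    ID3D10Device *pDevice = NULL;
    if (sFnPtr_D3D10CreateDevice == NULL
        || FAILED(sFnPtr_D3D10CreateDevice(NULL, D3D10_DRIVER_TYPE_HARDWARE,
                                           NULL, 0, D3D10_SDK_VERSION, &pDevice)))
    {
        dynlinkUnloadD3D10API();
        return NULL;
    }
    return pDevice; // caller releases with pDevice->Release()
}
```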

View File

@ -666,6 +666,11 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
       {0x80, 64},
       {0x86, 128},
       {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xc0, 128},
       {-1, -1}};
   int index = 0;
@ -712,6 +717,12 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
       {0x75, "Turing"},
       {0x80, "Ampere"},
       {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x89, "Ada"},
+      {0x90, "Hopper"},
+      {0xa0, "Blackwell"},
+      {0xa1, "Blackwell"},
+      {0xc0, "Blackwell"},
       {-1, "Graphics Device"}};
   int index = 0;
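These two hunks extend the SM-version lookup tables through Ada (SM 8.9), Hopper (SM 9.0), and Blackwell (SM 10.0, 10.1, 12.0). A typical use of the two helpers, sketched here under the assumption that `helper_cuda.h` is on the include path:
```cpp
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h> // defines _ConvertSMVer2Cores / _ConvertSMVer2ArchName

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    int coresPerSM = _ConvertSMVer2Cores(prop.major, prop.minor);
    printf("SM %d.%d (%s): %d SMs x %d cores = %d CUDA cores\n",
           prop.major, prop.minor,
           _ConvertSMVer2ArchName(prop.major, prop.minor),
           prop.multiProcessorCount, coresPerSM,
           prop.multiProcessorCount * coresPerSM);
    return 0;
}
```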

View File

@ -114,6 +114,11 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
       {0x80, 64},
       {0x86, 128},
       {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xc0, 128},
       {-1, -1}};
   int index = 0;
@ -236,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
     }
     unsigned long long compute_perf =
-        (unsigned long long)(multiProcessorCount * sm_per_multiproc *
+        ((unsigned long long)multiProcessorCount * sm_per_multiproc *
             clockRate);
     if (compute_perf > max_compute_perf) {
@ -403,4 +408,3 @@ bool inline findFatbinPath(const char *module_file, std::string &module_path, ch
 // end of CUDA Helper Functions
 #endif  // COMMON_HELPER_CUDA_DRVAPI_H_
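The `compute_perf` change above is an overflow fix rather than a refactor: with the old parenthesization the three `int` factors were multiplied in 32-bit arithmetic and only the (possibly already wrapped) result was widened. Casting the first operand promotes the whole product to 64-bit. A distilled illustration with hypothetical values:
```cpp
int mpCount = 128, smPerMp = 128, clockKHz = 2000000; // hypothetical GPU
// Old form: the int product wraps before the cast is applied.
unsigned long long wrong = (unsigned long long)(mpCount * smPerMp * clockKHz);
// New form: widening one operand makes the entire product 64-bit.
unsigned long long right = (unsigned long long)mpCount * smPerMp * clockKHz;
// wrong != right here, because 128 * 128 * 2000000 exceeds INT_MAX.
```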

View File

@ -168,7 +168,7 @@ int waitProcess(Process *process) {
 #endif
 }
-#if defined(__linux__)
+#if defined(__linux__) || defined(__QNX__)
 int ipcCreateSocket(ipcHandle *&handle, const char *name,
                     const std::vector<Process> &processes) {
   int server_fd;
@ -262,41 +262,48 @@ int ipcRecvShareableHandle(ipcHandle *handle, ShareableHandle *shHandle) {
   // Union to guarantee alignment requirements for control array
   union {
     struct cmsghdr cm;
-    char control[CMSG_SPACE(sizeof(int))];
+    // This will not work on QNX as QNX CMSG_SPACE calls __cmsg_alignbytes
+    // And __cmsg_alignbytes is a runtime function instead of compile-time macros
+    // char control[CMSG_SPACE(sizeof(int))]
+    char* control;
   } control_un;
+  size_t sizeof_control = CMSG_SPACE(sizeof(int)) * sizeof(char);
+  control_un.control = (char*) malloc(sizeof_control);
   struct cmsghdr *cmptr;
   ssize_t n;
   int receivedfd;
   char dummy_buffer[1];
   ssize_t sendResult;
   msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  msg.msg_controllen = sizeof_control;
   iov[0].iov_base = (void *)dummy_buffer;
   iov[0].iov_len = sizeof(dummy_buffer);
   msg.msg_iov = iov;
   msg.msg_iovlen = 1;
   if ((n = recvmsg(handle->socket, &msg, 0)) <= 0) {
     perror("IPC failure: Receiving data over socket failed");
+    free(control_un.control);
     return -1;
   }
   if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) &&
       (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
     if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
+      free(control_un.control);
       return -1;
     }
     memmove(&receivedfd, CMSG_DATA(cmptr), sizeof(receivedfd));
     *(int *)shHandle = receivedfd;
   } else {
+    free(control_un.control);
     return -1;
   }
+  free(control_un.control);
   return 0;
 }
@ -340,9 +347,12 @@ int ipcSendShareableHandle(ipcHandle *handle,
   union {
     struct cmsghdr cm;
-    char control[CMSG_SPACE(sizeof(int))];
+    char* control;
   } control_un;
+  size_t sizeof_control = CMSG_SPACE(sizeof(int)) * sizeof(char);
+  control_un.control = (char*) malloc(sizeof_control);
   struct cmsghdr *cmptr;
   ssize_t readResult;
   struct sockaddr_un cliaddr;
@ -360,7 +370,7 @@ int ipcSendShareableHandle(ipcHandle *handle,
   int sendfd = (int)shareableHandles[data];
   msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  msg.msg_controllen = sizeof_control;
   cmptr = CMSG_FIRSTHDR(&msg);
   cmptr->cmsg_len = CMSG_LEN(sizeof(int));
@ -380,9 +390,11 @@ int ipcSendShareableHandle(ipcHandle *handle,
   ssize_t sendResult = sendmsg(handle->socket, &msg, 0);
   if (sendResult <= 0) {
     perror("IPC failure: Sending data over socket failed");
+    free(control_un.control);
     return -1;
   }
+  free(control_un.control);
   return 0;
 }
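Both hunks in this file make the same change for the same reason: on QNX, `CMSG_SPACE` calls a runtime function (`__cmsg_alignbytes`), so it cannot size an array member at compile time; the control buffer is therefore `malloc`'d, and the new `free` calls release it on every exit path. The receive side of that pattern, distilled into a standalone sketch (error handling trimmed; not the sample's exact code):
```c
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one file descriptor over a Unix-domain socket, sizing the
   cmsg control buffer at run time so it also works where CMSG_SPACE
   is not a compile-time constant (e.g. QNX). Returns the fd or -1. */
static int recvFd(int sock)
{
    size_t controlLen = CMSG_SPACE(sizeof(int));
    char *control = (char *)malloc(controlLen);
    char dummy;
    struct iovec iov = {&dummy, sizeof(dummy)};
    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = control;
    msg.msg_controllen = controlLen;

    int fd = -1;
    if (recvmsg(sock, &msg, 0) > 0) {
        struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
        if (cm && cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS)
            memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
    }
    free(control); /* released on every path, as in the patch above */
    return fd;
}
```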

View File

@ -84,7 +84,7 @@ int waitProcess(Process *process);
 #define checkIpcErrors(ipcFuncResult) \
   if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); }
-#if defined(__linux__)
+#if defined(__linux__) || defined(__QNX__)
 struct ipcHandle_st {
   int socket;
   char *socketName;

View File

@ -421,6 +421,7 @@ inline char *sdkFindFilePath(const char *filename,
   }
   // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
   return 0;
 }

View File

@ -258,7 +258,7 @@ namespace nv
     s[2] = &r3[0];
     s[3] = &r4[0];
-    register int i,j,p,jj;
+    int i,j,p,jj;
     for (i=0; i<4; i++)
     {

View File

@ -49,6 +49,11 @@
 void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResult,
                         size_t *cubinResultSize, int requiresCGheaders) {
+  if (!filename) {
+    std::cerr << "\nerror: filename is empty for compileFileToCUBIN()!\n";
+    exit(1);
+  }
   std::ifstream inputFile(filename,
                           std::ios::in | std::ios::binary | std::ios::ate);
@ -111,7 +116,12 @@ void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResul
     compileOptions = "--include-path=";
-    std::string path = sdkFindFilePath(HeaderNames, argv[0]);
+    char *strPath = sdkFindFilePath(HeaderNames, argv[0]);
+    if (!strPath) {
+      std::cerr << "\nerror: header file " << HeaderNames << " not found!\n";
+      exit(1);
+    }
+    std::string path = strPath;
     if (!path.empty()) {
       std::size_t found = path.find(HeaderNames);
       path.erase(found);
@ -120,6 +130,7 @@ void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResul
           "\nCooperativeGroups headers not found, please install it in %s "
           "sample directory..\n Exiting..\n",
           argv[0]);
+      exit(1);
     }
     compileOptions += path.c_str();
     compileParams[numCompileOptions] = reinterpret_cast<char *>(

View File

@ -1,128 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
//
// Utility funcs to wrap up saving a surface or the back buffer as a PPM file
// In addition, wraps up a threshold comparison of two PPMs.
//
// These functions are designed to be used to implement an automated QA testing
// for SDK samples.
//
// Author: Bryan Dudash
// Email: sdkfeedback@nvidia.com
//
// Copyright (c) NVIDIA Corporation. All rights reserved.
////////////////////////////////////////////////////////////////////////////////
#include <helper_functions.h>
#include <rendercheck_d3d10.h>
HRESULT CheckRenderD3D10::ActiveRenderTargetToPPM(ID3D10Device *pDevice,
const char *zFileName) {
ID3D10RenderTargetView *pRTV = NULL;
pDevice->OMGetRenderTargets(1, &pRTV, NULL);
ID3D10Resource *pSourceResource = NULL;
pRTV->GetResource(&pSourceResource);
return ResourceToPPM(pDevice, pSourceResource, zFileName);
}
HRESULT CheckRenderD3D10::ResourceToPPM(ID3D10Device *pDevice,
ID3D10Resource *pResource,
const char *zFileName) {
D3D10_RESOURCE_DIMENSION rType;
pResource->GetType(&rType);
if (rType != D3D10_RESOURCE_DIMENSION_TEXTURE2D) {
printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n");
return E_FAIL;
}
ID3D10Texture2D *pSourceTexture = (ID3D10Texture2D *)pResource;
ID3D10Texture2D *pTargetTexture = NULL;
D3D10_TEXTURE2D_DESC desc;
pSourceTexture->GetDesc(&desc);
desc.BindFlags = 0;
desc.CPUAccessFlags = D3D10_CPU_ACCESS_READ;
desc.Usage = D3D10_USAGE_STAGING;
if (FAILED(pDevice->CreateTexture2D(&desc, NULL, &pTargetTexture))) {
printf(
"SurfaceToPPM: Unable to create target Texture resoruce! Aborting... "
"\n");
return E_FAIL;
}
pDevice->CopyResource(pTargetTexture, pSourceTexture);
D3D10_MAPPED_TEXTURE2D mappedTex2D;
pTargetTexture->Map(0, D3D10_MAP_READ, 0, &mappedTex2D);
// Need to convert from dx pitch to pitch=width
unsigned char *pPPMData = new unsigned char[desc.Width * desc.Height * 4];
for (unsigned int iHeight = 0; iHeight < desc.Height; iHeight++) {
memcpy(
&(pPPMData[iHeight * desc.Width * 4]),
(unsigned char *)(mappedTex2D.pData) + iHeight * mappedTex2D.RowPitch,
desc.Width * 4);
}
pTargetTexture->Unmap(0);
// Prepends the PPM header info and bumps byte data afterwards
sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height);
delete[] pPPMData;
pTargetTexture->Release();
return S_OK;
}
bool CheckRenderD3D10::PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold) {
char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
if (ref_file_path == NULL) {
printf(
"CheckRenderD3D10::PPMvsPPM unable to find <%s> in <%s> Aborting "
"comparison!\n",
ref_file, exec_path);
printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
ref_file);
printf("Aborting comparison!\n");
printf(" FAILURE!\n");
return false;
}
return (sdkComparePPM(src_file, ref_file_path, epsilon, threshold, true) ==
true);
}

View File

@ -1,53 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#ifndef _RENDERCHECK_D3D10_H_
#define _RENDERCHECK_D3D10_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <d3d10.h>
class CheckRenderD3D10 {
public:
CheckRenderD3D10() {}
static HRESULT ActiveRenderTargetToPPM(ID3D10Device *pDevice,
const char *zFileName);
static HRESULT ResourceToPPM(ID3D10Device *pDevice, ID3D10Resource *pResource,
const char *zFileName);
static bool PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold = 0.0f);
};
#endif

View File

@ -1,167 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
//
// Utility funcs to wrap up saving a surface or the back buffer as a PPM file
// In addition, wraps up a threshold comparison of two PPMs.
//
// These functions are designed to be used to implement an automated QA testing
// for SDK samples.
//
// Author: Bryan Dudash
// Email: sdkfeedback@nvidia.com
//
// Copyright (c) NVIDIA Corporation. All rights reserved.
////////////////////////////////////////////////////////////////////////////////
#include <helper_functions.h>
#include <rendercheck_d3d9.h>
// originally copied from checkrender_gl.cpp and slightly modified
bool CheckRenderD3D9::PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold) {
char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
if (ref_file_path == NULL) {
printf(
"CheckRenderD3D9::PPMvsPPM unable to find <%s> in <%s> Aborting "
"comparison!\n",
ref_file, exec_path);
printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
ref_file);
printf("Aborting comparison!\n");
printf(" FAILURE!\n");
return false;
}
return (sdkComparePPM(src_file, ref_file_path, epsilon, threshold, true) ==
true);
};
HRESULT CheckRenderD3D9::BackbufferToPPM(IDirect3DDevice9 *pDevice,
const char *zFileName) {
IDirect3DSurface9 *pSurface = NULL;
if (FAILED(
pDevice->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pSurface))) {
printf("Unable to get the back buffer. Aborting...\n");
return E_FAIL;
}
// D3DXSaveSurfaceToFile("C:\\bing.dds",D3DXIFF_DDS,pSurface,NULL,NULL);
HRESULT hr = S_OK;
hr = SurfaceToPPM(pDevice, pSurface, zFileName);
pSurface->Release();
return hr;
}
HRESULT CheckRenderD3D9::SurfaceToPPM(IDirect3DDevice9 *pDevice,
IDirect3DSurface9 *pSurface,
const char *zFileName) {
D3DSURFACE_DESC pDesc;
pSurface->GetDesc(&pDesc);
// $$ For now only support common 8bit formats. TODO: support for more
// complex formats via conversion?
if (!(pDesc.Format == D3DFMT_A8R8G8B8 || pDesc.Format == D3DFMT_X8R8G8B8)) {
return E_INVALIDARG;
}
IDirect3DTexture9 *pTargetTex = NULL;
if (FAILED(pDevice->CreateTexture(pDesc.Width, pDesc.Height, 1,
D3DUSAGE_DYNAMIC, pDesc.Format,
D3DPOOL_SYSTEMMEM, &pTargetTex, NULL))) {
printf("Unable to create texture for surface transfer! Aborting...\n");
return E_FAIL;
}
IDirect3DSurface9 *pTargetSurface = NULL;
if (FAILED(pTargetTex->GetSurfaceLevel(0, &pTargetSurface))) {
printf("Unable to get surface for surface transfer! Aborting...\n");
return E_FAIL;
}
// This is required because we cannot lock a D3DPOOL_DEFAULT surface directly.
// So, we copy to our sysmem surface.
if (FAILED(pDevice->GetRenderTargetData(pSurface, pTargetSurface))) {
printf(
"Unable to GetRenderTargetData() for surface transfer! Aborting...\n");
return E_FAIL;
}
D3DLOCKED_RECT lockedRect;
HRESULT hr = pTargetSurface->LockRect(&lockedRect, NULL, 0);
// Need to convert from dx pitch to pitch=width
//
// $ PPM is BGR and not RGB it seems. Saved image looks "funny" in viewer(red
// and blue swapped), but since ref will be dumped using same method, this is
// ok.
// however, if we want the saved image to be properly colored, then we
// can swizzle the color bytes here.
unsigned char *pPPMData = new unsigned char[pDesc.Width * pDesc.Height * 4];
for (unsigned int iHeight = 0; iHeight < pDesc.Height; iHeight++) {
#if 1 // swizzle to implement RGB to BGR conversion.
for (unsigned int iWidth = 0; iWidth < pDesc.Width; iWidth++) {
DWORD color = *(DWORD *)((unsigned char *)(lockedRect.pBits) +
iHeight * lockedRect.Pitch + iWidth * 4);
// R<->B, [7:0] <-> [23:16], swizzle
color = ((color & 0xFF) << 16) | (color & 0xFF00) |
((color & 0xFF0000) >> 16) | (color & 0xFF000000);
memcpy(&(pPPMData[(iHeight * pDesc.Width + iWidth) * 4]),
(unsigned char *)&color, 4);
}
#else
memcpy(&(pPPMData[iHeight * pDesc.Width * 4]),
(unsigned char *)(lockedRect.pBits) + iHeight * lockedRect.Pitch,
pDesc.Width * 4);
#endif
}
pTargetSurface->UnlockRect();
// Prepends the PPM header info and bumps byte data afterwards
sdkSavePPM4ub(zFileName, pPPMData, pDesc.Width, pDesc.Height);
delete[] pPPMData;
pTargetSurface->Release();
pTargetTex->Release();
return S_OK;
}

View File

@ -1,54 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#ifndef _RENDERCHECK_D3D9_H_
#define _RENDERCHECK_D3D9_H_
#include <assert.h>
#include <d3d9.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
class CheckRenderD3D9 {
public:
CheckRenderD3D9() {}
static HRESULT BackbufferToPPM(IDirect3DDevice9 *pDevice,
const char *zFileName);
static HRESULT SurfaceToPPM(IDirect3DDevice9 *pDevice,
IDirect3DSurface9 *pSurface,
const char *zFileName);
static bool PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold = 0.0f);
};
#endif

README.md
View File

@ -1,23 +1,20 @@
 # CUDA Samples
-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
 ## Release Notes
 This section describes the release notes for the CUDA Samples on GitHub only.
-### CUDA 11.6
+### Change Log
-* Added new folder structure for samples
-* Added support of Visual Studio 2022 to all samples supported on [Windows](#windows-1).
-* All CUDA samples are now only available on [GitHub](https://github.com/nvidia/cuda-samples). They are no longer available via CUDA toolkit.
-### [older versions...](./CHANGELOG.md)
+### [Revision History](./CHANGELOG.md)
 ## Getting Started
 ### Prerequisites
-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).
 ### Getting the CUDA Samples
@ -31,43 +28,278 @@ Without using git the easiest way to use these samples is to download the zip fi
 ## Building CUDA Samples
-### Windows
+### Building CUDA Samples
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Complete samples solution files exist at parent directory of the repo:
-Each individual sample has its own set of solution files at:
-`<CUDA_SAMPLES_REPO>\Samples\<sample_dir>\`
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+The CUDA Samples are built using CMake. Follow the instructions below for building on Linux, Windows, and for cross-compilation to Tegra devices.
 ### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
-  By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-  `$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/> `$ make TARGET_ARCH=aarch64` <br/>
-  See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details on cross platform compilation of cuda samples.
-* **dbg=1** - build with debug symbols
-  ```
-  $ make dbg=1
-  ```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-  ```
-  $ make SMS="50 60"
-  ```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-  ```
-  $ make HOST_COMPILER=g++
-  ```
+Ensure that CMake (version 3.20 or later) is installed. Install it using your package manager if necessary, e.g.
+```sudo apt install cmake```
Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake:
```
cmake ..
```
Build the samples:
```
make -j$(nproc)
```
Run the samples from their respective directories in the build folder. You can also follow this process from any subdirectory of the samples repo, or from within any individual sample.
### Windows
Language services for CMake are available in Visual Studio 2019 version 16.5 or later, and you can directly import the CUDA samples repository from either the root level or from any
subdirectory or individual sample.
To build from the command line, open the `x64 Native Tools Command Prompt for VS` provided with your Visual Studio installation.
Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake - for example:
```
cmake .. -G "Visual Studio 16 2019" -A x64
```
Open the generated solution file CUDA_Samples.sln in Visual Studio. Build the samples by selecting the desired configuration (e.g., Debug or Release) and pressing F7 (Build Solution).
Run the samples from the output directories specified in Visual Studio.
### Enabling On-GPU Debugging
NVIDIA GPUs support on-GPU debugging through cuda-gdb. Enabling this may significantly affect application performance as certain compiler optimizations are disabled
in this configuration, hence it's not on by default. Enablement of on-device debugging is controlled via the `-G` switch to nvcc.
To enable cuda-gdb for samples builds, define the `ENABLE_CUDA_DEBUG` flag on the CMake command line. For example:
```
cmake -DENABLE_CUDA_DEBUG=True ...
```
### Platform-Specific Samples
Some CUDA samples are specific to certain platforms, and require passing flags into CMake to enable. In particular, we define the following platform-specific flags:
* `BUILD_TEGRA` - for Tegra-specific samples
To build these samples, set the variables either on the command line or through your CMake GUI. For example:
```
cmake -DBUILD_TEGRA=True ..
```
### Cross-Compilation for Tegra Platforms
Install the NVIDIA toolchain and cross-compilation environment for Tegra devices as described in the Tegra Development Guide.
Ensure that CMake (version 3.20 or later) is installed.
Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake, specifying the Tegra toolchain file. You can use `-DTARGET_FS` to point to the target file system root path for the necessary include and library files:
```
cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake -DTARGET_FS=/path/to/target/system/file/system
```
Build the samples:
```
make -j$(nproc)
```
Transfer the built binaries to the Tegra device and execute them there.
### Building for Automotive Linux Platforms
These platforms require additional information to be passed to CMake on the command line to ensure proper resolution of all necessary include and library files.
Instead of using the default locations `/usr/local/cuda/include` and `/usr/local/cuda/lib64`, you must point to architecture-specific paths:
`/usr/local/cuda/<ARCH>/targets/aarch64-linux/lib`
and
`/usr/local/cuda/<ARCH>/include`
An example build might look like this:
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_LIBRARY_PATH=/usr/local/cuda/orin/lib64/ -DCMAKE_INCLUDE_PATH=/usr/local/cuda/orin/include -DBUILD_TEGRA=True ..
```
### QNX
Note that in the current branch, sample cross-compilation for QNX is not fully validated. This placeholder will be updated in the
near future with QNX cross-compilation instructions. In the meantime, if you want to cross-compile for QNX please check out one
of the previous tags prior to the CMake build system transition in 12.8.
## Running All Samples as Tests
It's important to note that the CUDA samples are _not_ intended as a validation suite for CUDA. They do not cover corner cases, do not completely cover the
runtime and driver APIs, and are not intended for performance benchmarking. That said, it can sometimes be useful to run all of the samples as a quick sanity check, and
we provide a script to do so: `run_tests.py`.
This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. It accepts
the following command line arguments:
| Switch | Purpose | Example |
| ---------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- |
| --dir | Specify the root directory to search for executables (recursively) | --dir ./build/Samples |
| --config | JSON configuration file for executable arguments | --config test_args.json |
| --output | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test |
| --args | Global arguments to pass to all executables (not currently used) | --args arg_1 arg_2 ... |
| --parallel | Number of applications to execute in parallel. | --parallel 8 |
Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows).
The script returns 0 on success, or the first non-zero error code encountered during testing on failure. It will also print a condensed list of samples that failed, if any.
There are three primary modes of configuration:
**Skip**
An executable configured with "skip" will not be executed. These generally rely on having attached graphical displays and are not suited to this kind of automation.
Configuration example:
```json
"fluidsGL": {
"skip": true
}
```
You will see:
```
Skipping fluidsGL (marked as skip in config)
```
**Single Run**
For executables to run one time only with arguments, specify each argument as a list entry. Each entry in the JSON file will be appended to the command line, separated
by a space.
All applications execute from their current directory, so all paths are relative to the application's location.
Note that if an application needs no arguments, this entry is optional. An executable found without a matching entry in the JSON will just run as `./application` from its
current directory.
Configuration example:
```json
"ptxgen": {
"args": [
"test.ll",
"-arch=compute_75"
]
}
```
You will see:
```
Running ptxgen
Command: ./ptxgen test.ll -arch=compute_75
Test completed with return code 0
```
**Multiple Runs**
For executables to run multiple times with different command line arguments, specify any number of sets of args within a "runs" list.
As with single runs, all applications execute from their current directory, so all paths are relative to the application's location.
Configuration example:
```json
"recursiveGaussian": {
"runs": [
{
"args": [
"-sigma=10",
"-file=data/ref_10.ppm"
]
},
{
"args": [
"-sigma=14",
"-file=data/ref_14.ppm"
]
},
{
"args": [
"-sigma=18",
"-file=data/ref_18.ppm"
]
},
{
"args": [
"-sigma=22",
"-file=data/ref_22.ppm"
]
}
]
}
```
You will see:
```
Running recursiveGaussian (run 1/4)
Command: ./recursiveGaussian -sigma=10 -file=data/ref_10.ppm
Test completed with return code 0
Running recursiveGaussian (run 2/4)
Command: ./recursiveGaussian -sigma=14 -file=data/ref_14.ppm
Test completed with return code 0
Running recursiveGaussian (run 3/4)
Command: ./recursiveGaussian -sigma=18 -file=data/ref_18.ppm
Test completed with return code 0
Running recursiveGaussian (run 4/4)
Command: ./recursiveGaussian -sigma=22 -file=data/ref_22.ppm
Test completed with return code 0
```
### Example Usage
Here is an example set of commands to build and test all of the samples.
First, build:
```bash
mkdir build
cd build
cmake ..
make -j$(nproc)
```
Now, return to the samples root directory and run the test script:
```bash
cd ..
python3 run_tests.py --output ./test --dir ./build/Samples --config test_args.json
```
If all applications run successfully, you will see something similar to this (the specific number of samples will depend on your build type
and system configuration):
```
Test Summary:
Ran 199 test runs for 180 executables.
All test runs passed!
```
If some samples fail, you will see something like this:
```
Test Summary:
Ran 199 test runs for 180 executables.
Failed runs (2):
bicubicTexture (run 1/5): Failed (code 1)
Mandelbrot (run 1/2): Failed (code 1)
```
You can inspect the stdout logs in the output directory (generally `APM_<application_name>.txt` or `APM_<application_name>.run<n>.txt`) to help
determine what may have gone wrong. Please file issues against the samples repository if you believe a sample is failing
incorrectly on your system.
 ## Samples list
@ -92,6 +324,9 @@ Samples that are specific to domain (Graphics, Finance, Image Processing).
 ### [6. Performance](./Samples/6_Performance/README.md)
 Samples that demonstrate performance optimization.
+### [7. libNVVM](./Samples/7_libNVVM/README.md)
+Samples that demonstrate the use of libNVVM and NVVM IR.
 ## Dependencies
 Some CUDA Samples rely on third-party applications and/or libraries, or features provided by the CUDA Toolkit and Driver, to either build or execute. These dependencies are listed below.
@ -108,7 +343,7 @@ These third-party dependencies are required by some CUDA samples. If available,
 FreeImage is an open source imaging library. FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the FreeImage website.
-To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `../../../Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to root level `bin/win64/Debug` and `bin/win64/Release` folder.
+To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `./Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/Debug execution folder, or pass the FreeImage paths when configuring with CMake via the `-DFreeImage_INCLUDE_DIR` and `-DFreeImage_LIBRARY` options.
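For example, a configure line might look like this (illustrative paths; adjust to where you extracted FreeImage):
```
cmake .. -DFreeImage_INCLUDE_DIR=/path/to/FreeImage/Dist/x64 -DFreeImage_LIBRARY=/path/to/FreeImage/Dist/x64/FreeImage.lib
```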
 #### Message Passing Interface
@ -138,9 +373,14 @@ OpenGL ES is an embedded systems graphics library used for 2D and 3D rendering.
 Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan targets high-performance realtime 3D graphics applications such as video games and interactive media across all platforms. On systems which support Vulkan, NVIDIA's Vulkan implementation is provided with the CUDA Driver. For building and running Vulkan applications one needs to install the [Vulkan SDK](https://www.lunarg.com/vulkan-sdk/).
+#### GLFW
+GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
+To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html), extract the zip file, and pass the GLFW include folder to CMake as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR`.
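For example (illustrative placeholder paths for wherever you extracted GLFW):
```
cmake .. -DGLFW_INCLUDE_DIR=C:/glfw/include -DGLFW_LIB_DIR=C:/glfw/lib-vc2022
```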
 #### OpenMP
-OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).
+OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). For compilers such as clang, `libomp.so` and other LLVM components must be installed separately. You will also need to set additional flags in your CMake configuration, such as: `-DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY="/path/to/libomp.so"`.
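Putting those flags together, an illustrative clang configure line (the library path is a placeholder):
```
cmake .. -DCMAKE_CXX_COMPILER=clang++ -DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY=/path/to/libomp.so
```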
 #### Screen
@ -246,6 +486,10 @@ FP16 is a 16-bit floating-point format. One bit is used for the sign, five bits
 NVCC support of [C++11 features](https://en.wikipedia.org/wiki/C++11).
+#### CMake
+The libNVVM samples are built using [CMake](https://cmake.org/) 3.10 or later.
 ## Contributors Guide
 We welcome your input on issues and suggestions for samples. At this time we are not accepting contributions from the public; check back here as we evolve our contribution model.
@ -263,4 +507,4 @@ Answers to frequently asked questions about CUDA can be found at http://develope
 ## Attributions
 * Teapot image is obtained from [Wikimedia](https://en.wikipedia.org/wiki/File:Original_Utah_Teapot.jpg) and is licensed under the Creative Commons [Attribution-Share Alike 2.0](https://creativecommons.org/licenses/by-sa/2.0/deed.en) Generic license. The image is modified for samples use cases.

View File

@ -0,0 +1,46 @@
add_subdirectory(UnifiedMemoryStreams)
add_subdirectory(asyncAPI)
add_subdirectory(clock)
add_subdirectory(clock_nvrtc)
add_subdirectory(cudaOpenMP)
add_subdirectory(fp16ScalarProduct)
add_subdirectory(matrixMul)
add_subdirectory(matrixMulDrv)
add_subdirectory(matrixMulDynlinkJIT)
add_subdirectory(matrixMul_nvrtc)
add_subdirectory(mergeSort)
add_subdirectory(simpleAWBarrier)
add_subdirectory(simpleAssert)
add_subdirectory(simpleAssert_nvrtc)
add_subdirectory(simpleAtomicIntrinsics)
add_subdirectory(simpleAtomicIntrinsics_nvrtc)
add_subdirectory(simpleAttributes)
add_subdirectory(simpleCUDA2GL)
add_subdirectory(simpleCallback)
add_subdirectory(simpleCooperativeGroups)
add_subdirectory(simpleCubemapTexture)
add_subdirectory(simpleDrvRuntime)
add_subdirectory(simpleHyperQ)
add_subdirectory(simpleIPC)
add_subdirectory(simpleLayeredTexture)
add_subdirectory(simpleMPI)
add_subdirectory(simpleMultiCopy)
add_subdirectory(simpleMultiGPU)
add_subdirectory(simpleOccupancy)
add_subdirectory(simpleP2P)
add_subdirectory(simplePitchLinearTexture)
add_subdirectory(simplePrintf)
add_subdirectory(simpleStreams)
add_subdirectory(simpleSurfaceWrite)
add_subdirectory(simpleTemplates)
add_subdirectory(simpleTexture)
add_subdirectory(simpleTexture3D)
add_subdirectory(simpleTextureDrv)
add_subdirectory(simpleVoteIntrinsics)
add_subdirectory(simpleZeroCopy)
add_subdirectory(template)
add_subdirectory(systemWideAtomics)
add_subdirectory(vectorAdd)
add_subdirectory(vectorAddDrv)
add_subdirectory(vectorAddMMAP)
add_subdirectory(vectorAdd_nvrtc)

View File

@ -4,24 +4,12 @@
 ### [asyncAPI](./asyncAPI)
 This sample illustrates the usage of CUDA events for both GPU timing and overlapping CPU and GPU execution. Events are inserted into a stream of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can perform computations while GPU is executing (including DMA memcopies between the host and device). CPU can query CUDA events to determine whether GPU has completed tasks.
-### [c++11_cuda](./c++11_cuda)
-This sample demonstrates C++11 feature support in CUDA. It scans a input text file and prints no. of occurrences of x, y, z, w characters.
 ### [clock](./clock)
 This example shows how to use the clock function to measure the performance of block of threads of a kernel accurately.
 ### [clock_nvrtc](./clock_nvrtc)
 This example shows how to use the clock function using libNVRTC to measure the performance of block of threads of a kernel accurately.
-### [concurrentKernels](./concurrentKernels)
-This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function.
-### [cppIntegration](./cppIntegration)
-This example demonstrates how to integrate CUDA into an existing C++ application, i.e. the CUDA entry point on host side is only a function which is called from C++ code and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from cpp.
-### [cppOverload](./cppOverload)
-This sample demonstrates how to use C++ function overloading on the GPU.
 ### [cudaOpenMP](./cudaOpenMP)
 This sample demonstrates how to use OpenMP API to write an application for multiple GPUs.
@ -106,9 +94,6 @@ Use of Pitch Linear Textures
 ### [simplePrintf](./simplePrintf)
 This basic CUDA Runtime API sample demonstrates how to use the printf function in the device code.
-### [simpleSeparateCompilation](./simpleSeparateCompilation)
-This sample demonstrates a CUDA 5.0 feature, the ability to create a GPU device static library and use it within another CUDA kernel. This example demonstrates how to pass in a GPU device function (from the GPU device static library) as a function pointer to be called. This sample requires devices with compute capability 2.0 or higher.
 ### [simpleStreams](./simpleStreams)
 This sample uses CUDA streams to overlap kernel executions with memory copies between the host and a GPU device. This sample uses a new CUDA 4.0 feature that supports pinning of generic host memory. Requires Compute Capability 2.0 or higher.
@ -118,9 +103,6 @@ Simple example that demonstrates the use of 2D surface references (Write-to-Text
 ### [simpleTemplates](./simpleTemplates)
 This sample is a templatized version of the template project. It also shows how to correctly templatize dynamically allocated shared memory arrays.
-### [simpleTemplates_nvrtc](./simpleTemplates_nvrtc)
-This sample is a templatized version of the template project. It also shows how to correctly templatize dynamically allocated shared memory arrays.
 ### [simpleTexture](./simpleTexture)
 Simple example that demonstrates use of Textures in CUDA.
@ -133,9 +115,6 @@ Simple example that demonstrates use of Textures in CUDA. This sample uses the
 ### [simpleVoteIntrinsics](./simpleVoteIntrinsics)
 Simple program which demonstrates how to use the Vote (__any_sync, __all_sync) intrinsic instruction in a CUDA kernel.
-### [simpleVoteIntrinsics_nvrtc](./simpleVoteIntrinsics_nvrtc)
-Simple program which demonstrates how to use the Vote (any, all) intrinsic instruction in a CUDA kernel with runtime compilation using NVRTC APIs. Requires Compute Capability 2.0 or higher.
 ### [simpleZeroCopy](./simpleZeroCopy)
 This sample illustrates how to use Zero MemCopy, kernels can read and write directly to pinned system memory.
@ -159,4 +138,3 @@ This Vector Addition sample is a basic sample that is implemented element by ele
 ### [vectorAddMMAP](./vectorAddMMAP)
 This sample replaces the device allocation in the vectorAddDrv sample with cuMemMap-ed allocations. This sample demonstrates that the cuMemMap api allows the user to specify the physical properties of their memory while retaining the contiguous nature of their access, thus not requiring a change in their program structure.

View File

@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(UnifiedMemoryStreams LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
if(CMAKE_GENERATOR MATCHES "Visual Studio")
find_package(OpenMP REQUIRED C CXX)
else()
find_package(OpenMP REQUIRED)
endif()
if(${OpenMP_FOUND})
# Add target for UnifiedMemoryStreams
add_executable(UnifiedMemoryStreams UnifiedMemoryStreams.cu)
target_compile_options(UnifiedMemoryStreams PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(UnifiedMemoryStreams PRIVATE cxx_std_17 cuda_std_17)
set_target_properties(UnifiedMemoryStreams PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(UnifiedMemoryStreams PUBLIC
CUDA::cublas
OpenMP::OpenMP_CXX
)
else()
message(STATUS "OpenMP not found - will not build sample 'UnifiedMemoryStreams'")
endif()

View File

@ -1,381 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - UnifiedMemoryStreams is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Attempt to compile a minimal OpenMP application. If a.out exists, OpenMP is properly set up.
ifneq (,$(filter $(TARGET_OS),linux android))
ifneq (,$(filter $(TARGET_OS), android))
LIBRARIES += -lomp
else
LIBRARIES += -lgomp
endif
ALL_CCFLAGS += -Xcompiler -fopenmp
$(shell echo "#include <omp.h>" > test.c ; echo "int main() { omp_get_num_threads(); return 0; }" >> test.c ; $(HOST_COMPILER) -fopenmp test.c)
OPENMP ?= $(shell find a.out 2>/dev/null)
ifeq ($(OPENMP),)
$(info -----------------------------------------------------------------------------------------------)
$(info WARNING - OpenMP is unable to compile)
$(info -----------------------------------------------------------------------------------------------)
$(info This CUDA Sample cannot be built if the OpenMP compiler is not set up correctly.)
$(info This will be a dry-run of the Makefile.)
$(info For more information on how to set up your environment to build and run this )
$(info sample, please refer the CUDA Samples documentation and release notes)
$(info -----------------------------------------------------------------------------------------------)
SAMPLE_ENABLED := 0
endif
$(shell rm a.out test.c 2>/dev/null)
else
LIBRARIES += -lpthread
ALL_CCFLAGS += -DUSE_PTHREADS
endif
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
LIBRARIES += -lcublas
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: UnifiedMemoryStreams
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
UnifiedMemoryStreams.o:UnifiedMemoryStreams.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
UnifiedMemoryStreams: UnifiedMemoryStreams.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./UnifiedMemoryStreams
testrun: build
clean:
rm -f UnifiedMemoryStreams UnifiedMemoryStreams.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/UnifiedMemoryStreams
clobber: clean

View File

@@ -1,101 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>UnifiedMemoryStreams</name>
<cuda_api_list>
<toolkit>cudaStreamDestroy</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMallocManaged</toolkit>
<toolkit>cudaStreamCreate</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaStreamAttachMemAsync</toolkit>
<toolkit>cudaSetDevice</toolkit>
<toolkit>cudaStreamSynchronize</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
</cuda_api_list>
<description><![CDATA[This sample demonstrates the use of OpenMP and streams with Unified Memory on a single GPU.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">CUDA Systems Integration</concept>
<concept level="basic">OpenMP</concept>
<concept level="basic">CUBLAS</concept>
<concept level="basic">Multithreading</concept>
<concept level="basic">Unified Memory</concept>
<concept level="basic">CUDA Streams and Events</concept>
</keyconcepts>
<keywords>
<keyword>CUDA</keyword>
<keyword>CUBLAS</keyword>
<keyword>OpenMP</keyword>
<keyword>cluster</keyword>
<keyword>multi-GPU Support</keyword>
<keyword>Unified Memory</keyword>
<keyword>UVM</keyword>
<keyword>openMP</keyword>
<keyword>Streams</keyword>
<keyword>pthreads</keyword>
</keywords>
<libraries>
<library>cublas</library>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>UnifiedMemoryStreams.cu</primary_file>
<required_dependencies>
<dependency>OpenMP</dependency>
<dependency>UVM</dependency>
<dependency>CUBLAS</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:CUDA Systems Integration</scope>
<scope>1:Unified Memory</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<from>3.5</from>
</supported_sm_architectures>
<title>Unified Memory Streams</title>
<type>exe</type>
</entry>

View File

@@ -10,65 +10,25 @@ CUDA Systems Integration, OpenMP, CUBLAS, Multithreading, Unified Memory, CUDA Streams and Events
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSetDevice, cudaDeviceSynchronize, cudaStreamSynchronize, cudaStreamCreate, cudaGetDeviceProperties
## Dependencies needed to build/run
[OpenMP](../../../README.md#openmp), [UVM](../../../README.md#uvm), [CUBLAS](../../../README.md#cublas)
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@@ -31,10 +31,10 @@
*/
// system includes
#include <algorithm>
#include <cstdio>
#include <ctime>
#include <vector>
#ifdef USE_PTHREADS
#include <pthread.h>
#else
@@ -51,291 +51,287 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
// functions
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

const char *sSDKname = "UnifiedMemoryStreams";

// simple task
template <typename T> struct Task
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task()
        : size(0)
        , id(0)
        , data(NULL)
        , result(NULL)
        , vector(NULL) {};
    Task(unsigned int s)
        : size(s)
        , id(0)
        , data(NULL)
        , result(NULL)
    {
        // allocate unified memory -- the operation performed in this example will
        // be a DGEMV
        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
        checkCudaErrors(cudaDeviceSynchronize());
    }

    ~Task()
    {
        // ensure all memory is deallocated
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaFree(data));
        checkCudaErrors(cudaFree(result));
        checkCudaErrors(cudaFree(vector));
    }

    void allocate(const unsigned int s, const unsigned int unique_id)
    {
        // allocate unified memory outside of constructor
        id   = unique_id;
        size = s;
        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
        checkCudaErrors(cudaDeviceSynchronize());

        // populate data with random elements
        for (unsigned int i = 0; i < size * size; i++) {
            data[i] = drand48();
        }
        for (unsigned int i = 0; i < size; i++) {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};

typedef struct threadData_t threadData;
#endif

// simple host dgemv: assume data is in row-major format and square
template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
{
    // rows
    for (int i = 0; i < n; i++) {
        result[i] *= beta;
        for (int j = 0; j < n; j++) {
            result[i] += A[i * n + j] * x[j];
        }
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void *execute(void *inpArgs)
{
    threadData *dataPtr    = (threadData *)inpArgs;
    cudaStream_t *stream   = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid                = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++) {
        Task<double> &t = dataPtr->TaskListPtr[i];
        if (t.size < 100) {
            // perform on host
            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
            // attach managed memory to a (dummy) stream to allow host access while
            // the device is running
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
            // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
            checkCudaErrors(cudaStreamSynchronize(stream[0]));
            // call the host operation
            gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
        }
        else {
            // perform on device
            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
            double one  = 1.0;
            double zero = 0.0;

            // attach managed memory to my stream
            checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
            // call the device operation
            checkCudaErrors(cublasDgemv(
                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
        }
    }
    pthread_exit(NULL);
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    if (t.size < 100) {
        // perform on host
        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
        // attach managed memory to a (dummy) stream to allow host access while the
        // device is running
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
        // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        // call the host operation
        gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
    }
    else {
        // perform on device
        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
        double one  = 1.0;
        double zero = 0.0;

        // attach managed memory to my stream
        checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
        // call the device operation
        checkCudaErrors(cublasDgemv(
            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
    }
}
#endif

// populate a list of tasks with random sizes
template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++) {
        // generate random size
        int size;
        size = std::max((int)(drand48() * 1000.0), 64);
        TaskList[i].allocate(size, i);
    }
}

int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int dev_id = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if (!device_prop.managedMemory) {
        // This samples requires being run on a device that supports Unified Memory
        fprintf(stderr, "Unified Memory not supported on this device\n");
        exit(EXIT_WAIVED);
    }

    if (device_prop.computeMode == cudaComputeModeProhibited) {
        // This sample requires being run with a default or process exclusive mode
        fprintf(stderr,
                "This sample requires a device in either default or process "
                "exclusive mode\n");
        exit(EXIT_WAIVED);
    }

    // randomise task sizes
    int seed = (int)time(NULL);
    srand48(seed);

    // set number of threads
    const int nthreads = 4;

    // number of streams = number of threads
    cudaStream_t *streams   = new cudaStream_t[nthreads + 1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];

    for (int i = 0; i < nthreads + 1; i++) {
        checkCudaErrors(cudaStreamCreate(&streams[i]));
        checkCudaErrors(cublasCreate(&handles[i]));
    }

    // create list of N tasks
    unsigned int N = 40;
    std::vector<Task<double>> TaskList(N);
    initialise_tasks(TaskList);

    printf("Executing tasks on host / device\n");

// run through all tasks using threads and streams
#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];

    for (int i = 0; i < nthreads; i++) {
        checkCudaErrors(cudaSetDevice(dev_id));
        InputToThreads[i].tid     = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if ((TaskList.size() / nthreads) == 0) {
            InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
        }
        else {
            if (i == nthreads - 1) {
                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
                InputToThreads[i].TaskListPtr =
                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
            }
            else {
                InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++) {
        pthread_join(threads[i], NULL);
    }
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < TaskList.size(); i++) {
        checkCudaErrors(cudaSetDevice(dev_id));
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // Destroy CUDA Streams, cuBlas handles
    for (int i = 0; i < nthreads + 1; i++) {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }

    // Free TaskList
    std::vector<Task<double>>().swap(TaskList);

    printf("All Done!\n");
    exit(EXIT_SUCCESS);
}

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,113 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryStreams_vs2017</RootNamespace>
<ProjectName>UnifiedMemoryStreams</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="UnifiedMemoryStreams.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,109 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryStreams_vs2019</RootNamespace>
<ProjectName>UnifiedMemoryStreams</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="UnifiedMemoryStreams.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,109 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryStreams_vs2022</RootNamespace>
<ProjectName>UnifiedMemoryStreams</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="UnifiedMemoryStreams.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(asyncAPI LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
# Add target for asyncAPI
add_executable(asyncAPI asyncAPI.cu)
target_compile_options(asyncAPI PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(asyncAPI PRIVATE cxx_std_17 cuda_std_17)
set_target_properties(asyncAPI PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,341 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
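# Illustration (not part of the original Makefile): SMS="50 60" expands
# GENCODE_FLAGS to
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60
#   -gencode arch=compute_60,code=compute_60
# where the last entry embeds PTX so newer GPUs can JIT-compile the code.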
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: asyncAPI
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
asyncAPI.o:asyncAPI.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
asyncAPI: asyncAPI.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./asyncAPI
testrun: build
$(EXEC) ./asyncAPI --dummy-test-param
clean:
rm -f asyncAPI asyncAPI.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/asyncAPI
clobber: clean

View File

@@ -1,90 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>asyncAPI</name>
<cuda_api_list>
<toolkit>cudaMemset</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaEventRecord</toolkit>
<toolkit>cudaMallocHost</toolkit>
<toolkit>cudaProfilerStart</toolkit>
<toolkit>cudaEventCreate</toolkit>
<toolkit>cudaEventElapsedTime</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaFreeHost</toolkit>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaEventQuery</toolkit>
<toolkit>cudaProfilerStop</toolkit>
<toolkit>cudaEventDestroy</toolkit>
<toolkit>cudaMemcpyAsync</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
</cuda_api_list>
<description><![CDATA[This sample illustrates the usage of CUDA events for both GPU timing and overlapping CPU and GPU execution. Events are inserted into a stream of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can perform computations while the GPU is executing (including DMA memcopies between the host and device). The CPU can query CUDA events to determine whether the GPU has completed tasks (a minimal sketch of the pattern follows this entry).]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">Asynchronous Data Transfers</concept>
<concept level="basic">CUDA Streams and Events</concept>
</keyconcepts>
<keywords>
<keyword>GPGPU</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>asyncAPI.cu</primary_file>
<qatests>
<qatest>--dummy-test-param</qatest>
</qatests>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>asyncAPI</title>
<type>exe</type>
</entry>
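The description above is the heart of the sample. Distilled, the event pattern looks like the following minimal sketch (illustrative only: the `work` kernel is a stand-in and error checking is omitted; the real sample adds pinned host memory and async copies around the kernel):

```
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for any asynchronous GPU work.
__global__ void work(int *p) { p[threadIdx.x] += 1; }

int main() {
    int *d = nullptr;
    cudaMalloc(&d, 256 * sizeof(int));
    cudaMemset(d, 0, 256 * sizeof(int));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0); // marker enqueued into stream 0
    work<<<1, 256>>>(d);       // launch returns immediately
    cudaEventRecord(stop, 0);  // marker enqueued after the kernel

    unsigned long iters = 0;   // CPU stays busy while the GPU runs
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        ++iters;

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // GPU time between the markers
    std::printf("GPU: %.3f ms, CPU poll iterations: %lu\n", ms, iters);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d);
    return 0;
}
```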

View File

@@ -10,7 +10,7 @@ Asynchronous Data Transfers, CUDA Streams and Events
 ## Supported SM Architectures
-[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
 ## Supported OSes
@@ -18,53 +18,15 @@ Linux, Windows
 ## Supported CPU Architecture
-x86_64, ppc64le, armv7l
+x86_64, armv7l
 ## CUDA APIs involved
 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
-cudaMemset, cudaFree, cudaEventRecord, cudaMallocHost, cudaProfilerStart, cudaEventCreate, cudaEventElapsedTime, cudaDeviceSynchronize, cudaFreeHost, cudaMalloc, cudaEventQuery, cudaProfilerStop, cudaEventDestroy, cudaMemcpyAsync, cudaGetDeviceProperties
+cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaProfilerStart, cudaDeviceSynchronize, cudaEventRecord, cudaFreeHost, cudaMemset, cudaEventDestroy, cudaEventQuery, cudaEventElapsedTime, cudaGetDeviceProperties, cudaEventCreate
 ## Prerequisites
-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
-## Build and Run
-### Windows
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Each individual sample has its own set of solution files in its directory:
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
-> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
-### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
-By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
-See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
-* **dbg=1** - build with debug symbols
-```
-$ make dbg=1
-```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-```
-$ make SMS="50 60"
-```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-```
-$ make HOST_COMPILER=g++
-```
 ## References (for more details)

View File

@@ -38,105 +38,107 @@
(This hunk is a formatting-only change to asyncAPI.cu: the CUDA includes are reordered alphabetically and the brace/indent style changes from two-space K&R to four-space Allman; the logic is unchanged. The resulting file:)
#include <stdio.h>
// includes CUDA Runtime
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions

__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
        if (data[i] != x) {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    return true;
}

int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("[%s] - Starting...\n", argv[0]);
    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);
    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);
    int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 26;
    // allocate host memory
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);
    // allocate device memory
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));
    // set kernel launch configuration
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);
    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;
    // asynchronously issue work to the GPU (all to stream 0)
    checkCudaErrors(cudaProfilerStart());
    sdkStartTimer(&timer);
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    sdkStopTimer(&timer);
    checkCudaErrors(cudaProfilerStop());
    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady) {
        counter++;
    }
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
    // check the output for correctness
    bool bFinalResults = correct_output(a, n, value);
    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));
    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
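Everything in the timed section goes to the default stream (the trailing `0` arguments). The same sequence carries over unchanged to an explicit stream, the usual next step when overlapping several independent queues. A sketch of that variant, reusing the buffers, events, and kernel declared in the listing above (not part of the sample itself):

```
// Hypothetical variant: issue the same sequence into a user-created stream.
cudaStream_t stream;
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cudaEventRecord(start, stream));
checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, stream));
increment_kernel<<<blocks, threads, 0, stream>>>(d_a, value);
checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaEventRecord(stop, stream));
// Block instead of polling when the CPU has no useful work to overlap.
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaStreamDestroy(stream));
```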

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>asyncAPI_vs2017</RootNamespace>
<ProjectName>asyncAPI</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="asyncAPI.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>asyncAPI_vs2019</RootNamespace>
<ProjectName>asyncAPI</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="asyncAPI.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>asyncAPI_vs2022</RootNamespace>
<ProjectName>asyncAPI</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="asyncAPI.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,372 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - c++11_cuda is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Detect if the installed version of GCC supports the required C++11
ifeq ($(TARGET_OS),linux)
empty :=
space := $(empty) $(empty)
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
# Create version number without "."
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
GCCVERSION += 00
# Remove spaces from the version number
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
ifeq ($(IS_MIN_VERSION), 1)
$(info >>> GCC Version is greater than or equal to 4.7.0 <<<)
else
$(info >>> Waiving build. Minimum GCC version required is 4.7.0 <<<)
SAMPLE_ENABLED := 0
endif
endif
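# Illustration (not part of the original Makefile): a HOST_COMPILER whose
# -dumpversion prints "9.4.0" yields GCCVERSION = "9 4 0 00" -> "94000",
# which passes the 47000 (i.e. GCC 4.7.0) minimum-version check above.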
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --std=c++11 --threads 0
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: c++11_cuda
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
c++11_cuda.o:c++11_cuda.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
c++11_cuda: c++11_cuda.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./c++11_cuda
testrun: build
$(EXEC) ./c++11_cuda --dummy-test-param
clean:
rm -f c++11_cuda c++11_cuda.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/c++11_cuda
clobber: clean

View File

@@ -1,85 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>c++11_cuda</name>
<cflags>
<flag>--std=c++11</flag>
</cflags>
<cuda_api_list>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaMemset</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMemcpy</toolkit>
</cuda_api_list>
<description><![CDATA[This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the characters x, y, z, and w (a minimal sketch of the idea follows this entry).]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="advanced">CPP11 CUDA</concept>
</keyconcepts>
<keywords>
<keyword>GPGPU</keyword>
<keyword>CPP11</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>c++11_cuda.cu</primary_file>
<qatests>
<qatest>--dummy-test-param</qatest>
</qatests>
<required_dependencies>
<dependency>CPP11</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Advanced Topics</scope>
<scope>1:C++11 CUDA</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>C++11 CUDA</title>
<type>exe</type>
</entry>
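Distilled from the description above, the C++11 device-lambda idea fits in a few lines. A self-contained sketch (not the sample itself, which does its counting inside kernels; this version dispatches the same lambda through host-side Thrust and assumes nvcc's `--extended-lambda` flag):

```
#include <cstdio>
#include <thrust/count.h>
#include <thrust/device_vector.h>

// Count occurrences of x, y, z, w with a C++11 device lambda.
// Build sketch: nvcc -std=c++11 --extended-lambda count_sketch.cu
int main() {
    const char text[] = "wxyz appears here; so do w, x, y and z";
    thrust::device_vector<char> d_text(text, text + sizeof(text) - 1);
    int count = (int)thrust::count_if(
        d_text.begin(), d_text.end(), [] __device__(char c) {
            return c == 'x' || c == 'y' || c == 'z' || c == 'w';
        });
    std::printf("found %d of {x, y, z, w}\n", count);
    return 0;
}
```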

View File

@@ -1,74 +0,0 @@
# c++11_cuda - C++11 CUDA
## Description
This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the characters x, y, z, and w.
## Key Concepts
CPP11 CUDA
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMalloc, cudaMemset, cudaFree, cudaMemcpy
## Dependencies needed to build/run
[CPP11](../../README.md#cpp11)
## Prerequisites
Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory.
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The sample makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@@ -1,140 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <thrust/device_ptr.h>
#include <thrust/count.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <helper_cuda.h>
/////////////////////////////////////////////////////////////////
// Some utility code to define grid_stride_range
// Normally this would be in a header but it's here
// for didactic purposes. Uses range.hpp:
#include "range.hpp"
using namespace util::lang;
// type alias to simplify typing...
template <typename T>
using step_range = typename range_proxy<T>::step_range_proxy;
template <typename T>
__device__ step_range<T> grid_stride_range(T begin, T end) {
begin += blockDim.x * blockIdx.x + threadIdx.x;
return range(begin, end).step(gridDim.x * blockDim.x);
}
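// Editor's note (illustrative, not part of the original sample): a typical
// grid-stride kernel written with this helper would look like
//   __global__ void scale(float *v, int n, float s) {
//     for (auto i : grid_stride_range(0, n)) v[i] *= s;
//   }
// Each thread starts at its global thread index and strides by the total
// number of threads in the grid, so any launch configuration covers all n
// elements.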
/////////////////////////////////////////////////////////////////
template <typename T, typename Predicate>
__device__ void count_if(int *count, T *data, int n, Predicate p) {
for (auto i : grid_stride_range(0, n)) {
if (p(data[i])) atomicAdd(count, 1);
}
}
// Use count_if with a lambda function that searches for x, y, z or w
// Note the use of range-based for loop and initializer_list inside the functor
// We use auto so we don't have to know the type of the functor or array
__global__ void xyzw_frequency(int *count, char *text, int n) {
const char letters[]{'x', 'y', 'z', 'w'};
count_if(count, text, n, [&](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
__global__ void xyzw_frequency_thrust_device(int *count, char *text, int n) {
const char letters[]{'x', 'y', 'z', 'w'};
*count = thrust::count_if(thrust::device, text, text + n, [=](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
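// Editor's note: the device-side lambda passed to thrust::count_if above
// requires compiling with nvcc's --extended-lambda flag (spelled
// --expt-extended-lambda on older toolkits).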
// a bug in Thrust 1.8 causes warnings when this is uncommented
// so commented out by default -- fixed in Thrust master branch
#if 0
void xyzw_frequency_thrust_host(int *count, char *text, int n)
{
const char letters[] { 'x','y','z','w' };
*count = thrust::count_if(thrust::host, text, text+n, [&](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
#endif
int main(int argc, char **argv) {
const char *filename = sdkFindFilePath("warandpeace.txt", argv[0]);
int numBytes = 16 * 1048576;
char *h_text = (char *)malloc(numBytes);
// find first CUDA device
int devID = findCudaDevice(argc, (const char **)argv);
char *d_text;
checkCudaErrors(cudaMalloc((void **)&d_text, numBytes));
FILE *fp = fopen(filename, "r");
if (fp == NULL) {
printf("Cannot find the input text file\n. Exiting..\n");
return EXIT_FAILURE;
}
int len = (int)fread(h_text, sizeof(char), numBytes, fp);
fclose(fp);
std::cout << "Read " << len << " byte corpus from " << filename << std::endl;
checkCudaErrors(cudaMemcpy(d_text, h_text, len, cudaMemcpyHostToDevice));
int count = 0;
int *d_count;
checkCudaErrors(cudaMalloc(&d_count, sizeof(int)));
checkCudaErrors(cudaMemset(d_count, 0, sizeof(int)));
// Try uncommenting one kernel call at a time
xyzw_frequency<<<8, 256>>>(d_count, d_text, len);
xyzw_frequency_thrust_device<<<1, 1>>>(d_count, d_text, len);
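// Editor's note: because both kernels run, the Thrust kernel's plain write
// to *count overwrites the atomicAdd total accumulated by xyzw_frequency;
// the two totals should agree, and the value printed below comes from the
// Thrust version.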
checkCudaErrors(
cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost));
// xyzw_frequency_thrust_host(&count, h_text, len);
std::cout << "counted " << count
<< " instances of 'x', 'y', 'z', or 'w' in \"" << filename << "\""
<< std::endl;
checkCudaErrors(cudaFree(d_count));
checkCudaErrors(cudaFree(d_text));
return EXIT_SUCCESS;
}


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>c++11_cuda_vs2017</RootNamespace>
<ProjectName>c++11_cuda</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
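<!-- Editor's note: each compute_XY,sm_XY pair above embeds SASS for that
     specific GPU architecture; adding a compute_XY,compute_XY entry would
     also embed forward-compatible PTX. -->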
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="c++11_cuda.cu" />
<ClInclude Include="range.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>c++11_cuda_vs2019</RootNamespace>
<ProjectName>c++11_cuda</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="c++11_cuda.cu" />
<ClInclude Include="range.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>c++11_cuda_vs2022</RootNamespace>
<ProjectName>c++11_cuda</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="c++11_cuda.cu" />
<ClInclude Include="range.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,279 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTIL_LANG_RANGE_HPP
#define UTIL_LANG_RANGE_HPP
#include <iterator>
#include <type_traits>
// Make these ranges usable inside CUDA C++ device code
#ifdef __CUDACC__
#define DEVICE_CALLABLE __host__ __device__
#else
#define DEVICE_CALLABLE
#endif
namespace util {
namespace lang {
namespace detail {
template <typename T>
struct range_iter_base : std::iterator<std::input_iterator_tag, T> {
DEVICE_CALLABLE
range_iter_base(T current) : current(current) {}
DEVICE_CALLABLE
T operator*() const { return current; }
DEVICE_CALLABLE
T const* operator->() const { return &current; }
DEVICE_CALLABLE
range_iter_base& operator++() {
++current;
return *this;
}
DEVICE_CALLABLE
range_iter_base operator++(int) {
auto copy = *this;
++*this;
return copy;
}
DEVICE_CALLABLE
bool operator==(range_iter_base const& other) const {
return current == other.current;
}
DEVICE_CALLABLE
bool operator!=(range_iter_base const& other) const {
return not(*this == other);
}
protected:
T current;
};
} // namespace detail
template <typename T>
struct range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current) : detail::range_iter_base<T>(current) {}
};
struct step_range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current, T step)
: detail::range_iter_base<T>(current), step(step) {}
using detail::range_iter_base<T>::current;
DEVICE_CALLABLE
iter& operator++() {
current += step;
return *this;
}
DEVICE_CALLABLE
iter operator++(int) {
auto copy = *this;
++*this;
return copy;
}
// Loses commutativity. Iterator-based ranges are simply broken. :-(
DEVICE_CALLABLE
bool operator==(iter const& other) const {
return step > 0 ? current >= other.current : current < other.current;
}
DEVICE_CALLABLE
bool operator!=(iter const& other) const { return !(*this == other); }
private:
T step;
};
DEVICE_CALLABLE
step_range_proxy(T begin, T end, T step)
: begin_(begin, step), end_(end, step) {}
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return end_; }
private:
iter begin_;
iter end_;
};
DEVICE_CALLABLE
range_proxy(T begin, T end) : begin_(begin), end_(end) {}
DEVICE_CALLABLE
step_range_proxy step(T step) { return {*begin_, *end_, step}; }
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return end_; }
private:
iter begin_;
iter end_;
};
template <typename T>
struct infinite_range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current = T()) : detail::range_iter_base<T>(current) {}
DEVICE_CALLABLE
bool operator==(iter const&) const { return false; }
DEVICE_CALLABLE
bool operator!=(iter const&) const { return true; }
};
struct step_range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current = T(), T step = T())
: detail::range_iter_base<T>(current), step(step) {}
using detail::range_iter_base<T>::current;
DEVICE_CALLABLE
iter& operator++() {
current += step;
return *this;
}
DEVICE_CALLABLE
iter operator++(int) {
auto copy = *this;
++*this;
return copy;
}
DEVICE_CALLABLE
bool operator==(iter const&) const { return false; }
DEVICE_CALLABLE
bool operator!=(iter const&) const { return true; }
private:
T step;
};
DEVICE_CALLABLE
step_range_proxy(T begin, T step) : begin_(begin, step) {}
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return iter(); }
private:
iter begin_;
};
DEVICE_CALLABLE
infinite_range_proxy(T begin) : begin_(begin) {}
DEVICE_CALLABLE
step_range_proxy step(T step) { return step_range_proxy(*begin_, step); }
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return iter(); }
private:
iter begin_;
};
template <typename T>
DEVICE_CALLABLE range_proxy<T> range(T begin, T end) {
return {begin, end};
}
template <typename T>
DEVICE_CALLABLE infinite_range_proxy<T> range(T begin) {
return {begin};
}
namespace traits {
template <typename C>
struct has_size {
template <typename T>
static constexpr auto check(T*) ->
typename std::is_integral<decltype(std::declval<T const>().size())>::type;
template <typename>
static constexpr auto check(...) -> std::false_type;
using type = decltype(check<C>(0));
static constexpr bool value = type::value;
};
} // namespace traits
template <typename C,
typename = typename std::enable_if<traits::has_size<C>::value>>
DEVICE_CALLABLE auto indices(C const& cont)
-> range_proxy<decltype(cont.size())> {
return {0, cont.size()};
}
template <typename T, std::size_t N>
DEVICE_CALLABLE range_proxy<std::size_t> indices(T(&)[N]) {
return {0, N};
}
template <typename T>
range_proxy<typename std::initializer_list<T>::size_type> DEVICE_CALLABLE
indices(std::initializer_list<T>&& cont) {
return {0, cont.size()};
}
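// Editor's note, illustrative host-side usage (not part of the original header):
//   for (auto i : util::lang::range(0, 10).step(2)) { /* i = 0,2,4,6,8 */ }
//   int a[4];
//   for (auto i : util::lang::indices(a))           { /* i = 0,1,2,3 */ }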
} // namespace lang
} // namespace util
#endif // ndef UTIL_LANG_RANGE_HPP

File diff suppressed because it is too large


@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(clock LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
# Add target for clock
add_executable(clock clock.cu)
target_compile_options(clock PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(clock PRIVATE cxx_std_17 cuda_std_17)
set_target_properties(clock PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
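# Editor's note (illustrative): from this sample's directory, a typical
# out-of-source build would be
#   cmake -B build && cmake --build build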


@ -1,340 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
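# Editor's note (illustrative): with SMS="50 60" the rules above expand to
#   GENCODE_FLAGS = -gencode arch=compute_50,code=sm_50
#                   -gencode arch=compute_60,code=sm_60
#                   -gencode arch=compute_60,code=compute_60
# i.e. SASS for each listed SM plus PTX for the highest one.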
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: clock
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
clock.o:clock.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
clock: clock.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./clock
testrun: build
clean:
rm -f clock clock.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/clock
clobber: clean


@ -1,78 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>clock</name>
<cuda_api_list>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMemcpy</toolkit>
</cuda_api_list>
<description><![CDATA[This example shows how to use the clock function to accurately measure the performance of a block of threads in a kernel.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">Performance Strategies</concept>
</keyconcepts>
<keywords>
<keyword>performance</keyword>
<keyword>timing</keyword>
<keyword>CUDA</keyword>
<keyword>clock</keyword>
<keyword>timer</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>clock.cu</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>Clock</title>
<type>exe</type>
</entry>


@ -10,7 +10,7 @@ Performance Strategies
 ## Supported SM Architectures
-[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
 ## Supported OSes
@ -18,53 +18,15 @@ Linux, Windows
 ## Supported CPU Architecture
-x86_64, ppc64le, armv7l
+x86_64, armv7l
 ## CUDA APIs involved
 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
-cudaMalloc, cudaFree, cudaMemcpy
+cudaMalloc, cudaMemcpy, cudaFree
 ## Prerequisites
-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
-## Build and Run
-### Windows
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Each individual sample has its own set of solution files in its directory:
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
-> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
-### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
-By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
-See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
-* **dbg=1** - build with debug symbols
-```
-$ make dbg=1
-```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-```
-$ make SMS="50 60"
-```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-```
-$ make HOST_COMPILER=g++
-```
 ## References (for more details)


@ -48,43 +48,46 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];
     shared[tid + blockDim.x] = input[tid + blockDim.x];

     // Perform reduction to find minimum.
     for (int d = blockDim.x; d > 0; d /= 2) {
         __syncthreads();

         if (tid < d) {
             float f0 = shared[tid];
             float f1 = shared[tid + d];

             if (f1 < f0) {
                 shared[tid] = f1;
             }
         }
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }

 #define NUM_BLOCKS 64
 #define NUM_THREADS 256

 // It's interesting to change the number of blocks and the number of threads to
@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output,
 // the memory. With more than 32 the speed scales linearly.

 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     // This will pick the best possible CUDA capable device
     int dev = findCudaDevice(argc, (const char **)argv);

     float *dinput = NULL;
     float *doutput = NULL;
     clock_t *dtimer = NULL;

     clock_t timer[NUM_BLOCKS * 2];
     float input[NUM_THREADS * 2];

     for (int i = 0; i < NUM_THREADS * 2; i++) {
         input[i] = (float)i;
     }

-    checkCudaErrors(
-        cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(
-        cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

-    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-        dinput, doutput, dtimer);
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

-    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

     checkCudaErrors(cudaFree(dinput));
     checkCudaErrors(cudaFree(doutput));
     checkCudaErrors(cudaFree(dtimer));

     long double avgElapsedClocks = 0;

     for (int i = 0; i < NUM_BLOCKS; i++) {
         avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
     }

     avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
     printf("Average clocks/block = %Lf\n", avgElapsedClocks);

     return EXIT_SUCCESS;
 }


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_vs2017</RootNamespace>
<ProjectName>clock</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="clock.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_vs2019</RootNamespace>
<ProjectName>clock</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="clock.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_vs2022</RootNamespace>
<ProjectName>clock</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="clock.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -0,0 +1,39 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(clock_nvrtc LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
# Add sample target executable
add_executable(clock_nvrtc clock.cpp)
target_compile_options(clock_nvrtc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(clock_nvrtc PRIVATE cxx_std_17 cuda_std_17)
target_link_libraries(clock_nvrtc PRIVATE
CUDA::nvrtc
CUDA::cuda_driver
)
# Copy clock_kernel.cu to the output directory
add_custom_command(TARGET clock_nvrtc POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_SOURCE_DIR}/clock_kernel.cu ${CMAKE_CURRENT_BINARY_DIR}
)
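# Editor's note (assumed invocation, not part of the original file): a typical
# out-of-source configure/build of this sample would be
#   cmake -B build -S . && cmake --build build
# with -DENABLE_CUDA_DEBUG=ON at configure time selecting the device-debug
# (-G) path in the conditional above.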

View File

@@ -1,392 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - clock_nvrtc is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# libNVRTC specific libraries
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
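# Editor's note (assumption, not in the original file): the stubs directories
# searched above contain a link-time-only libcuda.so; at run time the real
# libcuda.so installed by the NVIDIA driver is loaded instead, which is why a
# missing stub is reported as a missing driver.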
ALL_CCFLAGS += --threads 0 --std=c++11
INCLUDES += -I$(CUDA_PATH)/include
LIBRARIES += -lnvrtc
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: clock_nvrtc
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
clock.o:clock.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
clock_nvrtc: clock.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./clock_nvrtc
testrun: build
clean:
rm -f clock_nvrtc clock.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/clock_nvrtc
clobber: clean
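# Editor's note (assumed usage, not in the original file): `make` builds the
# sample, `make run` builds and executes it, and `make dbg=1` enables the
# debug flags selected above.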

View File

@@ -10,7 +10,7 @@ Performance Strategies, Runtime Compilation

 ## Supported SM Architectures

-[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes
@@ -18,60 +18,22 @@ Linux, Windows, QNX

 ## Supported CPU Architecture

-x86_64, ppc64le, aarch64
+x86_64, aarch64

 ## CUDA APIs involved

 ### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
-cuModuleGetFunction, cuMemAlloc, cuLaunchKernel, cuCtxSynchronize, cuMemFree, cuMemcpyDtoH, cuMemcpyHtoD
+cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemFree, cuModuleGetFunction

 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
 cudaBlockSize, cudaGridSize

 ## Dependencies needed to build/run
-[NVRTC](../../README.md#nvrtc)
+[NVRTC](../../../README.md#nvrtc)

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

-## Build and Run
-
-### Windows
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Each individual sample has its own set of solution files in its directory:
-
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
-
-> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
-
-### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
-    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-    `$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
-    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
-* **dbg=1** - build with debug symbols
-    ```
-    $ make dbg=1
-    ```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-    ```
-    $ make SMS="50 60"
-    ```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-    ```
-    $ make HOST_COMPILER=g++
-    ```

 ## References (for more details)

View File

@@ -34,12 +34,11 @@
  */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>
 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@@ -71,64 +70,68 @@
 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
-  printf("CUDA Clock sample\n");
+int main(int argc, char **argv)
+{
+    printf("CUDA Clock sample\n");

     typedef long clock_t;

     clock_t timer[NUM_BLOCKS * 2];
     float input[NUM_THREADS * 2];

     for (int i = 0; i < NUM_THREADS * 2; i++) {
         input[i] = (float)i;
     }

     char *cubin, *kernel_file;
     size_t cubinSize;

     kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
     compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

     CUmodule module = loadCUBIN(cubin, argc, argv);

     CUfunction kernel_addr;
     checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));

     dim3 cudaBlockSize(NUM_THREADS, 1, 1);
     dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

     CUdeviceptr dinput, doutput, dtimer;
     checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
     checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));

     void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-  checkCudaErrors(cuLaunchKernel(
-      kernel_addr, cudaGridSize.x, cudaGridSize.y,
-      cudaGridSize.z, /* grid dim */
-      cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
-      sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
-      &arr[0], /* arguments */
-      0));
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
+                                   cudaGridSize.z, /* grid dim */
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
+                                   cudaBlockSize.z, /* block dim */
+                                   sizeof(float) * 2 * NUM_THREADS,
+                                   0, /* shared mem, stream */
+                                   &arr[0], /* arguments */
+                                   0));

     checkCudaErrors(cuCtxSynchronize());
-  checkCudaErrors(
-      cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemFree(dinput));
     checkCudaErrors(cuMemFree(doutput));
     checkCudaErrors(cuMemFree(dtimer));

     long double avgElapsedClocks = 0;

     for (int i = 0; i < NUM_BLOCKS; i++) {
         avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
     }

     avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
     printf("Average clocks/block = %Lf\n", avgElapsedClocks);

     return EXIT_SUCCESS;
 }
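
The `compileFileToCUBIN` and `loadCUBIN` calls above are helpers from `nvrtc_helper.h` under `Common/`. As a rough, illustrative sketch only (not the actual helper implementation), the same flow can be expressed with the public NVRTC and driver APIs, taking the simpler PTX path and assuming `cuInit` has run and a context is current:

```
// Illustrative sketch, not the nvrtc_helper.h implementation.
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Compile in-memory CUDA source to PTX and load it as a module.
// Assumes cuInit(0) has been called and a context is current.
static CUmodule compileAndLoad(const char *source, const char *name)
{
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, source, name, 0, nullptr, nullptr);

    if (nvrtcCompileProgram(prog, 0, nullptr) != NVRTC_SUCCESS) {
        size_t logSize = 0;
        nvrtcGetProgramLogSize(prog, &logSize);
        std::vector<char> log(logSize);
        nvrtcGetProgramLog(prog, log.data());
        std::fprintf(stderr, "NVRTC compile failed:\n%s\n", log.data());
        std::exit(EXIT_FAILURE);
    }

    size_t ptxSize = 0;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());
    nvrtcDestroyProgram(&prog);

    CUmodule module;
    cuModuleLoadData(&module, ptx.data()); // error checking elided for brevity
    return module;
}
```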

View File

@@ -37,38 +37,41 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output,
-                                          clock_t *timer) {
+extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-  if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];
     shared[tid + blockDim.x] = input[tid + blockDim.x];

     // Perform reduction to find minimum.
     for (int d = blockDim.x; d > 0; d /= 2) {
         __syncthreads();

         if (tid < d) {
             float f0 = shared[tid];
             float f1 = shared[tid + d];

             if (f1 < f0) {
                 shared[tid] = f1;
             }
         }
     }

     // Write result.
-  if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-  if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
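
The `sizeof(float) * 2 * NUM_THREADS` argument passed to `cuLaunchKernel` in `clock.cpp` is what sizes the `extern __shared__ float shared[]` declaration in this kernel. For readers more familiar with the runtime API, a hypothetical equivalent launch (assuming the kernel were compiled directly into the executable instead of through NVRTC) might look like:

```
// Hypothetical runtime-API fragment; the sample itself launches through the
// driver API after NVRTC compilation. NUM_BLOCKS/NUM_THREADS as in the sample.
float   *dinput, *doutput;
clock_t *dtimer;
cudaMalloc(&dinput, sizeof(float) * NUM_THREADS * 2);
cudaMalloc(&doutput, sizeof(float) * NUM_BLOCKS);
cudaMalloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2);

// The third launch parameter is the dynamic shared-memory size in bytes;
// it backs the kernel's `extern __shared__ float shared[]` array.
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
cudaDeviceSynchronize();
```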

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_nvrtc_vs2017</RootNamespace>
<ProjectName>clock_nvrtc</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="clock.cpp" />
<None Include="clock_kernel.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_nvrtc_vs2019</RootNamespace>
<ProjectName>clock_nvrtc</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="clock.cpp" />
<None Include="clock_kernel.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_nvrtc_vs2022</RootNamespace>
<ProjectName>clock_nvrtc</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="clock.cpp" />
<None Include="clock_kernel.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,340 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
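# Illustrative note (not part of the original build logic): with SMS="50 60",
# the foreach/eval above expands GENCODE_FLAGS to
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60
#   -gencode arch=compute_60,code=compute_60
# i.e. SASS for each listed SM plus PTX for the highest one.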
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
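# Illustrative note (not part of the original build logic): when the sample is
# waived, EXEC expands to an echo command, so every $(EXEC)-prefixed recipe
# line below is printed instead of executed.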
################################################################################
# Target rules
all: build
build: concurrentKernels
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
concurrentKernels.o:concurrentKernels.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
concurrentKernels: concurrentKernels.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./concurrentKernels
testrun: build
clean:
rm -f concurrentKernels concurrentKernels.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/concurrentKernels
clobber: clean

@@ -1,87 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>concurrentKernels</name>
<cuda_api_list>
<toolkit>cudaStreamWaitEvent</toolkit>
<toolkit>cudaStreamDestroy</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaEventRecord</toolkit>
<toolkit>cudaMallocHost</toolkit>
<toolkit>cudaStreamCreate</toolkit>
<toolkit>cudaEventCreate</toolkit>
<toolkit>cudaEventElapsedTime</toolkit>
<toolkit>cudaEventSynchronize</toolkit>
<toolkit>cudaFreeHost</toolkit>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaEventCreateWithFlags</toolkit>
<toolkit>cudaEventDestroy</toolkit>
<toolkit>cudaMemcpyAsync</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
<toolkit>cudaGetDevice</toolkit>
</cuda_api_list>
<description><![CDATA[This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU device. It also illustrates how to introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="advanced">Performance Strategies</concept>
</keyconcepts>
<keywords>
<keyword>CUDA</keyword>
<keyword>Concurrent Kernels</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>concurrentKernels.cu</primary_file>
<scopes>
<scope>1:CUDA Advanced Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>Concurrent Kernels</title>
</entry>

@@ -1,70 +0,0 @@
# concurrentKernels - Concurrent Kernels
## Description
This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU device. It also illustrates how to introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.
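In essence, the sample launches several dummy kernels into their own streams, then makes one final stream wait on all of them before reducing the results. A minimal, self-contained sketch of that stream/event pattern (illustrative only; the kernels and names below are not taken from the sample):
```
// Sketch of ordering two streams with an event (illustrative, not sample code).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void produce(int *x) { *x = 42; }
__global__ void consume(const int *x, int *y) { *y = *x + 1; }

int main() {
    int *d = nullptr;
    cudaMalloc(&d, 2 * sizeof(int));

    cudaStream_t s0, s1;
    cudaStreamCreate(&s0);
    cudaStreamCreate(&s1);

    cudaEvent_t done;
    // Disable timing: the event is used purely for ordering, as in the sample.
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    produce<<<1, 1, 0, s0>>>(d);
    cudaEventRecord(done, s0);         // mark completion of the work in s0
    cudaStreamWaitEvent(s1, done, 0);  // s1 waits on the GPU; the host never blocks
    consume<<<1, 1, 0, s1>>>(d, d + 1);

    int result = 0;
    cudaMemcpyAsync(&result, d + 1, sizeof(int), cudaMemcpyDeviceToHost, s1);
    cudaStreamSynchronize(s1);
    printf("result = %d\n", result);   // expect 43

    cudaEventDestroy(done);
    cudaStreamDestroy(s0);
    cudaStreamDestroy(s1);
    cudaFree(d);
    return 0;
}
```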
## Key Concepts
Performance Strategies
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaStreamWaitEvent, cudaStreamDestroy, cudaFree, cudaEventRecord, cudaMallocHost, cudaStreamCreate, cudaEventCreate, cudaEventElapsedTime, cudaEventSynchronize, cudaFreeHost, cudaMalloc, cudaEventCreateWithFlags, cudaEventDestroy, cudaMemcpyAsync, cudaGetDeviceProperties, cudaGetDevice
## Prerequisites
Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory.
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples' makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

@@ -1,228 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// This sample demonstrates the use of streams for concurrent execution. It also
// illustrates how to introduce dependencies between CUDA streams with the
// cudaStreamWaitEvent function.
//
// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <cooperative_groups.h>
#include <stdio.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <helper_functions.h>
// This is a kernel that does no real work but runs for at least a specified
// number of clocks
__global__ void clock_block(clock_t *d_o, clock_t clock_count) {
unsigned int start_clock = (unsigned int)clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count) {
unsigned int end_clock = (unsigned int)clock();
// The code below should work like
// this (thanks to modular arithmetic):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32; then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x; i < N; i += blockDim.x) {
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
cg::sync(cta);
for (int i = 16; i > 0; i /= 2) {
if (threadIdx.x < i) {
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
cg::sync(cta);
}
d_clocks[0] = s_clocks[0];
}
int main(int argc, char **argv) {
int nkernels = 8; // number of concurrent kernels
int nstreams = nkernels + 1; // use one more stream than concurrent kernels
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
float kernel_time = 10; // time the kernel should run in ms
float elapsed_time; // timing variables
int cuda_device = 0;
printf("[%s] - Starting...\n", argv[0]);
// get number of kernels if overridden on the command line
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels")) {
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
nstreams = nkernels + 1;
}
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
if ((deviceProp.concurrentKernels == 0)) {
printf("> GPU does not support concurrent kernel execution\n");
printf(" CUDA kernel runs will be serialized\n");
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// allocate host memory
clock_t *a = 0; // pointer to the array data in host memory
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
// allocate device memory
clock_t *d_a = 0; // pointer to the data in device memory
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
// allocate and initialize an array of stream handles
cudaStream_t *streams =
(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// create CUDA event handles
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// The events are used for synchronization only, and hence do not need to
// record timings. This also keeps the events from introducing global sync
// points when recorded, which is critical to getting overlap.
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t *)malloc(nkernels * sizeof(cudaEvent_t));
for (int i = 0; i < nkernels; i++) {
checkCudaErrors(
cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}
//////////////////////////////////////////////////////////////////////
// time execution with nkernels streams
clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
// On Arm architectures the kernel takes more time than the channel reset
// time, so reduce time_clocks to prevent hangs.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
cudaEventRecord(start_event, 0);
// queue nkernels in separate streams and record when they are done
for (int i = 0; i < nkernels; ++i) {
clock_block<<<1, 1, 0, streams[i]>>>(&d_a[i], time_clocks);
total_clocks += time_clocks;
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
// make the last stream wait for the kernel event to be recorded
checkCudaErrors(
cudaStreamWaitEvent(streams[nstreams - 1], kernelEvent[i], 0));
}
// queue a sum kernel and a copy back to host in the last stream.
// the commands in this stream get dispatched as soon as all the kernel events
// have been recorded
sum<<<1, 32, 0, streams[nstreams - 1]>>>(d_a, nkernels);
checkCudaErrors(cudaMemcpyAsync(
a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams - 1]));
// At this point the CPU has dispatched all work for the GPU and can
// continue processing other tasks in parallel. In this sample we just wait
// until the GPU is done.
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels,
nkernels * kernel_time / 1000.0f);
printf("Expected time for concurrent execution of %d kernels = %.3fs\n",
nkernels, kernel_time / 1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);
bool bTestResult = (a[0] > total_clocks);
// release resources
for (int i = 0; i < nkernels; i++) {
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
free(streams);
free(kernelEvent);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
if (!bTestResult) {
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>concurrentKernels_vs2017</RootNamespace>
<ProjectName>concurrentKernels</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="concurrentKernels.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>concurrentKernels_vs2019</RootNamespace>
<ProjectName>concurrentKernels</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="concurrentKernels.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>concurrentKernels_vs2022</RootNamespace>
<ProjectName>concurrentKernels</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="concurrentKernels.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

@@ -1,346 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar; differentiate them at the host level
# for now by checking whether the toolkit ships an sbsa-linux target directory.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on a native aarch64 system with a 32-bit userspace, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
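# Illustrative note (not part of the original build logic): $(sort) orders
# words lexicographically, which is correct for the two-digit SM values used
# here but would misorder a three-digit value such as 100.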
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: cppIntegration
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
cppIntegration.o:cppIntegration.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cppIntegration_gold.o:cppIntegration_gold.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
main.o:main.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cppIntegration: cppIntegration.o cppIntegration_gold.o main.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./cppIntegration
testrun: build
clean:
rm -f cppIntegration cppIntegration.o cppIntegration_gold.o main.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cppIntegration
clobber: clean

@@ -1,72 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>cppIntegration</name>
<cuda_api_list>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMemcpy</toolkit>
</cuda_api_list>
<description><![CDATA[This example demonstrates how to integrate CUDA into an existing C++ application: the CUDA entry point on the host side is just a function called from C++ code, and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from C++.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">CPP-CUDA Integration</concept>
</keyconcepts>
<keywords>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>cppIntegration.cu</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>C++ Integration</title>
<type>exe</type>
</entry>

@@ -1,70 +0,0 @@
# cppIntegration - C++ Integration
## Description
This example demonstrates how to integrate CUDA into an existing C++ application: the CUDA entry point on the host side is just a function called from C++ code, and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from C++.
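A minimal, self-contained sketch of the layout this implies (illustrative only; the file and function names are hypothetical, not the sample's own):
```
// gpu_part.cu (compiled with nvcc; the only file that sees CUDA syntax)
#include <cuda_runtime.h>

__global__ void addOne(int *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1;
}

extern "C" void launchAddOne(int *hostData, int n) {
    int *d = nullptr;
    cudaMalloc(&d, n * sizeof(int));
    cudaMemcpy(d, hostData, n * sizeof(int), cudaMemcpyHostToDevice);
    addOne<<<(n + 255) / 256, 256>>>(d, n);
    cudaMemcpy(hostData, d, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d);
}

// main.cpp (compiled with the host C++ compiler; no CUDA headers needed)
extern "C" void launchAddOne(int *hostData, int n);

int main() {
    int data[4] = {1, 2, 3, 4};
    launchAddOne(data, 4);  // the single CUDA entry point called from C++
    return 0;               // data is now {2, 3, 4, 5}
}
```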
## Key Concepts
CPP-CUDA Integration
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMalloc, cudaFree, cudaMemcpy
## Prerequisites
Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory.
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples' makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

@@ -1,172 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Example of integrating CUDA functions into an existing
* application / framework.
* Host part of the device code.
* Compiled with Cuda compiler.
*/
// System includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
extern "C" void computeGold(char *reference, char *idata,
const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata,
const unsigned int len);
///////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data) {
// write data to global memory
const unsigned int tid = threadIdx.x;
int data = g_data[tid];
// use integer arithmetic to process all four bytes with one thread
// this serializes the execution, but is the simplest solution to avoid
// bank conflicts for this very low number of threads
// in general it is more efficient to process each byte by a separate thread,
// to avoid bank conflicts the access pattern should be
// g_data[4 * wtid + wid], where wtid is the thread id within the half warp
// and wid is the warp id
// see also the programming guide for a more in depth discussion.
g_data[tid] =
((((data << 0) >> 24) - 10) << 24) | ((((data << 8) >> 24) - 10) << 16) |
((((data << 16) >> 24) - 10) << 8) | ((((data << 24) >> 24) - 10) << 0);
}
///////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel2(int2 *g_data) {
// write data to global memory
const unsigned int tid = threadIdx.x;
int2 data = g_data[tid];
// use integer arithmetic to process all four bytes with one thread
// this serializes the execution, but is the simplest solution to avoid
// bank conflicts for this very low number of threads
// in general it is more efficient to process each byte by a separate thread,
// to avoid bank conflicts the access pattern should be
// g_data[4 * wtid + wid], where wtid is the thread id within the half warp
// and wid is the warp id
// see also the programming guide for a more in depth discussion.
g_data[tid].x = data.x - data.y;
}
////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//! @param argc command line argument count
//! @param argv command line arguments
//! @param data data to process on the device
//! @param len len of \a data
////////////////////////////////////////////////////////////////////////////////
extern "C" bool runTest(const int argc, const char **argv, char *data,
int2 *data_int2, unsigned int len) {
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
findCudaDevice(argc, (const char **)argv);
const unsigned int num_threads = len / 4;
assert(0 == (len % 4));
const unsigned int mem_size = sizeof(char) * len;
const unsigned int mem_size_int2 = sizeof(int2) * len;
// allocate device memory
char *d_data;
checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for int2 version
int2 *d_data_int2;
checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
cudaMemcpyHostToDevice));
// setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);
dim3 threads2(len, 1, 1); // more threads needed for the separate int2 version
// execute the kernel
kernel<<<grid, threads>>>((int *)d_data);
kernel2<<<grid, threads2>>>(d_data_int2);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
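// note: kernel launches are asynchronous; the cudaMemcpy calls below
// synchronize with the device before the results are read back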
// compute reference solutions
char *reference = (char *)malloc(mem_size);
computeGold(reference, data, len);
int2 *reference2 = (int2 *)malloc(mem_size_int2);
computeGold2(reference2, data_int2, len);
// copy results from device to host
checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
cudaMemcpyDeviceToHost));
// check result
bool success = true;
for (unsigned int i = 0; i < len; i++) {
if (reference[i] != data[i] || reference2[i].x != data_int2[i].x ||
reference2[i].y != data_int2[i].y) {
success = false;
}
}
// cleanup memory
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_data_int2));
free(reference);
free(reference2);
return success;
}


@ -1,67 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Example of integrating CUDA functions into an existing
* application / framework.
* Reference solution computation.
*/
// Required header to support CUDA vector types
#include <vector_types.h>
////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C" void computeGold(char *reference, char *idata,
const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! Each element is decremented by 10
//! @param reference preallocated buffer that receives the reference data
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(char *reference, char *idata, const unsigned int len) {
for (unsigned int i = 0; i < len; ++i) reference[i] = idata[i] - 10;
}
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set for int2 version
//! Each x component is replaced by x - y; the y component is left unchanged
//! @param reference preallocated buffer that receives the reference data
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold2(int2 *reference, int2 *idata, const unsigned int len) {
for (unsigned int i = 0; i < len; ++i) {
reference[i].x = idata[i].x - idata[i].y;
reference[i].y = idata[i].y;
}
}


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppIntegration", "cppIntegration_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,114 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cppIntegration_vs2017</RootNamespace>
<ProjectName>cppIntegration</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cppIntegration.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="cppIntegration.cu" />
<ClCompile Include="cppIntegration_gold.cpp" />
<ClCompile Include="main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppIntegration", "cppIntegration_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,110 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cppIntegration_vs2019</RootNamespace>
<ProjectName>cppIntegration</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cppIntegration.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="cppIntegration.cu" />
<ClCompile Include="cppIntegration_gold.cpp" />
<ClCompile Include="main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppIntegration", "cppIntegration_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,110 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cppIntegration_vs2022</RootNamespace>
<ProjectName>cppIntegration</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cppIntegration.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="cppIntegration.cu" />
<ClCompile Include="cppIntegration_gold.cpp" />
<ClCompile Include="main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,86 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Example of integrating CUDA functions into an existing
* application / framework.
* CPP code representing the existing application / framework.
* Compiled with default CPP compiler.
*/
// includes, system
#include <iostream>
#include <stdlib.h>
// Required to include CUDA vector types
#include <cuda_runtime.h>
#include <vector_types.h>
#include <helper_cuda.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
extern "C" bool runTest(const int argc, const char **argv, char *data,
int2 *data_int2, unsigned int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
// input data
int len = 16;
// the data is padded at the end with bytes of value 10 so that its size is a
// multiple of four; this lets each thread process four elements (which is
// necessary to avoid bank conflicts) without branching to guard against
// out-of-bounds reads, and the kernel's subtraction turns the padding into
// NUL terminators
char str[] = {82, 111, 118, 118, 121, 42, 97, 121,
124, 118, 110, 56, 10, 10, 10, 10};
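// (once the kernel subtracts 10 from every byte, this decodes to the ASCII
// string "Hello World.")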
// use int2 to show that CUDA vector types can be used in C++ code
int2 i2[16];
for (int i = 0; i < len; i++) {
i2[i].x = str[i];
i2[i].y = 10;
}
bool bTestResult;
// run the device part of the program
bTestResult = runTest(argc, (const char **)argv, str, i2, len);
std::cout << str << std::endl;
char str_device[16];
for (int i = 0; i < len; i++) {
str_device[i] = (char)(i2[i].x);
}
std::cout << str_device << std::endl;
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}


@ -1,340 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# On a native aarch64 system with a 32-bit userspace, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
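# Example (hypothetical SMS value): with SMS = "70 86" the loop above expands
# to -gencode arch=compute_70,code=sm_70 -gencode arch=compute_86,code=sm_86,
# plus -gencode arch=compute_86,code=compute_86 for the PTX forward-compat path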
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: cppOverload
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
cppOverload.o:cppOverload.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cppOverload: cppOverload.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./cppOverload
testrun: build
clean:
rm -f cppOverload cppOverload.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cppOverload
clobber: clean

Some files were not shown because too many files have changed in this diff.