Removing stray cpp from v12.1

2026-03-25 11:25:43 +08:00 · 2023-05-31 17:46:09 +00:00
3385 changed files with 672423 additions and 126917 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,49 +0,0 @@
---
-AccessModifierOffset: -4
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: Consecutive
-AlignConsecutiveDeclarations: Consecutive
-AlignConsecutiveMacros: Consecutive
-AlignEscapedNewlines: Left
-AlignOperands: AlignAfterOperator
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: false
-BinPackArguments: false
-BinPackParameters: false
-BraceWrapping:
-    AfterClass: true
-    AfterControlStatement: false
-    AfterExternBlock: true
-    AfterFunction: true
-    AfterStruct: true
-    AfterUnion: true
-    BeforeCatch: true
-    BeforeElse: true
-    IndentBraces: false
-BreakBeforeBraces: Custom
-BreakBeforeConceptDeclarations: true
-BreakBeforeBinaryOperators: NonAssignment
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: BeforeComma
-BreakInheritanceList: BeforeComma
-ColumnLimit: 120
-DerivePointerAlignment: false
-FixNamespaceComments: true
-IncludeCategories:
-  - Regex:           '^<.*>'
-    Priority:        1
-  - Regex:           '^".*"'
-    Priority:        2
-SortIncludes: true
-IncludeBlocks: Regroup
-IndentWidth: 4
-MaxEmptyLinesToKeep: 2
-PointerAlignment: Right
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-Standard: c++17
-TabWidth: 4
-UseTab: Never
-...
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1 @@
-build
-.vs
-.clangd
-test
-settings.json
-launch.json
+.vscode/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,106 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-ci:
-    autofix_commit_msg: |
-      [pre-commit.ci] auto code formatting
-    autofix_prs: false
-    autoupdate_branch: ''
-    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
-    autoupdate_schedule: quarterly
-    skip: []
-    submodules: false
-
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
-    hooks:
-      - id: end-of-file-fixer
-        exclude: |
-          (?x)^(
-            .*\.raw$|
-            .*\.bin$|
-            .*\.dat$|
-            .*\.nv12$|
-            data/.*|
-            Common/.*
-          )
-        files: |
-          (?x)^(
-            .*\.txt$|
-            .*\.md$|
-            .*\.cpp$|
-            .*\.cxx$|
-            .*\.hpp$|
-            .*\.h$|
-            .*\.cu$|
-            .*\.cuh$|
-            .*\.py$|
-            .*\.json$
-          )
-      - id: mixed-line-ending
-        exclude: |
-          (?x)^(
-            .*\.raw$|
-            .*\.bin$|
-            .*\.dat$|
-            .*\.nv12$|
-            data/.*|
-            Common/.*
-          )
-        files: |
-          (?x)^(
-            .*\.txt$|
-            .*\.md$|
-            .*\.cpp$|
-            .*\.cxx$|
-            .*\.hpp$|
-            .*\.h$|
-            .*\.cu$|
-            .*\.cuh$|
-            .*\.py$|
-            .*\.json$
-          )
-      - id: trailing-whitespace
-        exclude: |
-          (?x)^(
-            .*\.raw$|
-            .*\.bin$|
-            .*\.dat$|
-            .*\.nv12$|
-            data/.*|
-            Common/.*
-          )
-        files: |
-          (?x)^(
-            .*\.txt$|
-            .*\.md$|
-            .*\.cpp$|
-            .*\.cxx$|
-            .*\.hpp$|
-            .*\.h$|
-            .*\.cu$|
-            .*\.cuh$|
-            .*\.py$|
-            .*\.json$
-          )
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.6
-    hooks:
-      - id: clang-format
-        types_or: [file]
-        files: |
-          (?x)^(
-            ^.*\.c$|
-            ^.*\.cpp$|
-            ^.*\.cu$|
-            ^.*\.cuh$|
-            ^.*\.cxx$|
-            ^.*\.h$|
-            ^.*\.hpp$|
-            ^.*\.inl$|
-            ^.*\.mm$
-          )
-        exclude: |
-          (?x)^(
-            Common/.*
-          )
-        args: ["-fallback-style=none", "-style=file", "-i"]
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,161 +1,10 @@
 ## Changelog

-### CUDA 13.1
-* Minor bug fixes and enhancements, no structural or functional changes
-
-### CUDA 13.0
-* Updated the samples using the cudaDeviceProp fields which are deprecated and removed in CUDA 13.0, replacing the fields with the equivalents in "cudaDeviceGetAttribute":
-    * Deprecated "cudaDeviceProp" fields
-        `int clockRate; // - Replaced with "cudaDevAttrClockRate"`
-        `int deviceOverlap; // - Replaced with "cudaDevAttrGpuOverlap */`
-        `int kernelExecTimeoutEnabled; // - Replaced with "cudaDevAttrKernelExecTimeout`
-        `int computeMode; // - Replaced with "cudaDevAttrComputeMode" */`
-        `int memoryClockRate; // - Replaced with "cudaDevAttrMemoryClockRate"`
-        `int cooperativeMultiDeviceLaunch; // - Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated.`
-    * `0_Introduction`
-        * `UnifiedMemoryStreams`
-        * `simpleHyperQ`
-        * `simpleIPC`
-        * `simpleMultiCopy`
-        * `systemWideAtomics`
-    * `1_Utilitie`
-        * `deviceQuery`
-    * `2_Concepts_and_Techniques`
-        * `streamOrderedAllocationIPC`
-    * `4_CUDA_Libraries`
-        * `simpleCUBLASXT`
-    * `5_Domain_Specific`
-        * `simpleVulkan`
-        * `vulkanImageCUDA`
-* Updated the samples using the CUDA driver API "cuCtxCreate" with adding the parameter "CUctxCreateParams" as "cuCtxCreate" is updated to "cuCtxCreate_v4" by default in CUDA 13.0:
-    * `Common`
-        * `nvrtc_helper.h`
-    * `0_Introduction`
-        * `UnifiedMemoryStreams`
-        * `matrixMulDrv`
-        * `simpleTextureDrv`
-        * `vectorAddDrv`
-        * `vectorAddMMAP`
-    * `2_Concepts_and_Techniques`
-        * `EGLStream_CUDA_CrossGPU`
-        * `EGLStream_CUDA_Interop`
-        * `threadMigration`
-    * `3_CUDA_Features`
-        * `graphMemoryFootprint`
-        * `memMapIPCDrv`
-    * `4_CUDA_Libraries`
-        * `jitLto`
-    * `7_libNVVM`
-        * `cuda-c-linking`
-        * `device-side-launch`
-        * `simple`
-        * `uvmlite`
-    * `8_Platform_Specific/Tegra`
-        * `EGLSync_CUDAEvent_Interop`
-* Updated the sample using CUDA API "cudaGraphAddNode"/"cudaStreamGetCaptureInfo" with adding "cudaGraphEdgeData" pointer parameter as they are updated to "cudaGraphAddNode_v2"/"cudaStreamGetCaptureInfo_v3" by default in CUDA 13.0:
-    * `3_CUDA_Features`
-        * `graphConditionalNodes`
-* Updated the samples using CUDA API "cudaMemAdvise"/"cudaMemPrefetchAsync" with changing the parameter "int device" to "cudaMemLocation location" as they are updated to "cudaMemAdvise_v2"/"cudaMemPrefetchAsyn_v2" by default in CUDA 13.0.
-    * `4_CUDA_Libraries`
-        * `conjugateGradientMultiDeviceCG`
-    * `6_Performance`
-        * `UnifiedMemoryPerf`
-* Replaced "thrust::identity<uint>()" with "cuda::std::identity()" as it is deprecated in CUDA 13.0.
-    * `2_Concepts_and_Techniques`
-        * `segmentationTreeThrust`
-* Updated the the headers file and samples for CUFFT error codes update.
-    * Deprecated CUFFT errors:
-        * `CUFFT_INCOMPLETE_PARAMETER_LIST`
-        * `CUFFT_PARSE_ERROR`
-        * `CUFFT_LICENSE_ERROR`
-    * New added CUFFT errors:
-        * `CUFFT_MISSING_DEPENDENCY`
-        * `CUFFT_NVRTC_FAILURE`
-        * `CUFFT_NVJITLINK_FAILURE`
-        * `CUFFT_NVSHMEM_FAILURE`
-    * Header files and samples that are related with this change:
-        * `Common/helper_cuda.h`
-        * `4_CUDA_Libraries`
-            * `simpleCUFFT`
-            * `simpleCUFFT_2d_MGPU`
-            * `simpleCUFFT_MGPU`
-            * `simpleCUFFT_callback`
-* Updated toolchain for cross-compilation for Tegra QNX platforms.
-
-### CUDA 12.9
-* Updated toolchain for cross-compilation for Tegra Linux platforms.
-* Added `run_tests.py` utility to exercise all samples. See README.md for details
-* Repository has been updated with consistent code formatting across all samples
-* Many small code tweaks and bug fixes (see commit history for details)
-* Removed the following outdated samples:
-  * `1_Utilities`
-    * `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth
-    testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth)
-
-### CUDA 12.8
-* Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles.
-* Removed the following outdated samples:
-    * `0_Introduction`
-        * `c++11_cuda` demonstrating CUDA and C++ 11 interoperability (reason: obsolete)
-        * `concurrentKernels` demonstrating the ability to run multiple kernels simultaneously (reason: obsolete)
-        * `cppIntegration` demonstrating calling between .cu and .cpp files (reason: obsolete)
-        * `cppOverload` demonstrating C++ function overloading (reason: obsolete)
-        * `simpleSeparateCompilation` demonstrating NVCC compilation to a static library (reason: trivial)
-        * `simpleTemplates_nvrtc` demonstrating NVRTC usage for `simpleTemplates` sample (reason: redundant)
-        * `simpleVoteIntrinsics_nvrtc` demonstrating NVRTC usage for `simpleVoteIntrinsics` sample (reason: redundant)
-    * `2_Concepts_and_Techniques`
-        * `cuHook` demonstrating dlsym hooks. (reason: incompatible with modern `glibc`)
-    * `4_CUDA_Libraries`
-        * `batchedLabelMarkersAndLabelCompressionNPP` demonstrating NPP features (reason: some functionality removed from library)
-    * `5_Domain_Specific`
-        * Legacy Direct3D 9 and 10 interoperability samples:
-            * `fluidsD3D9`
-            * `simpleD3D10`
-            * `simpleD3D10RenderTarget`
-            * `simpleD3D10Texture`
-            * `simpleD3D9`
-            * `simpleD3D9Texture`
-            * `SLID3D10Texture`
-            * `VFlockingD3D10`
-    * `8_Platform_Specific/Tegra`
-        * Temporarily removed the following two samples pending updates:
-            * `nbody_screen` demonstrating the nbody sample in QNX
-            * `simpleGLES_screen` demonstrating GLES interop in QNX
-* Moved the following Tegra-specific samples to a dedicated subdirectory: `8_Platform_Specific/Tegra`
-    * `EGLSync_CUDAEvent_Interop`
-    * `cuDLAErrorReporting`
-    * `cuDLAHybridMode`
-    * `cuDLALayerwiseStatsHybrid`
-    * `cuDLALayerwiseStatsStandalone`
-    * `cuDLAStandaloneMode`
-    * `cudaNvSciBufMultiplanar`
-    * `cudaNvSciNvMedia`
-    * `fluidsGLES`
-    * `nbody_opengles`
-    * `simpleGLES`
-    * `simpleGLES_EGLOutput`
-
-
-
-### CUDA 12.5
-
-### CUDA 12.4
-* Added graphConditionalNodes Sample
-
-### CUDA 12.3
-* Added cuDLA samples
-* Fixed jitLto regression
-
-### CUDA 12.2
-* libNVVM samples received updates
-* Fixed jitLto Case issues
-* Enabled HOST_COMPILER flag to the makefiles for GCC which is untested but may still work.
-
 ### CUDA 12.1
 * Added new sample for Large Kernels

 ### CUDA 12.0
-* Added new flags for JIT compiling
+* Added new flags for JIT compiling 
 * Removed deprecated APIs in Hopper Architecture

 ### CUDA 11.6
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,28 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-project(cuda-samples LANGUAGES C CXX CUDA)
-
-find_package(CUDAToolkit REQUIRED)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-
-set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
-endif()
-
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
-
-# Include installation configuration before processing samples
-include(cmake/InstallSamples.cmake)
-
-add_subdirectory(Samples)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,103 +0,0 @@
-
-# Contributing to the CUDA Samples
-
-Thank you for your interest in contributing to the CUDA Samples!
-
-
-## Getting Started
-
-1. **Fork & Clone the Repository**:
-
-   Fork the reporistory and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository).
-
-## Making Changes
-
-1. **Create a New Branch**:
-
-   ```bash
-   git checkout -b your-feature-branch
-   ```
-
-2. **Make Changes**.
-
-3. **Build and Test**:
-
-   Ensure changes don't break existing functionality by building and running tests.
-
-   For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below.
-
-4. **Commit Changes**:
-
-   ```bash
-   git commit -m "Brief description of the change"
-   ```
-
-## Building and Testing
-
-For information on building a running tests on the samples, please refer to the main [README](README.md)
-
-## Creating a Pull Request
-
-1. Push changes to your fork
-2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request.
-3. Describe the purpose and context of the changes in the pull request description.
-
-## Code Formatting (pre-commit hooks)
-
-The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These
-tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter
-versions and options are aligned for all developers. Additionally, there is a CI check in place to
-enforce that committed code follows our standards.
-
-The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`.
-For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html).
-
-To use `pre-commit`, install via `conda` or `pip`:
-
-```bash
-conda config --add channels conda-forge
-conda install pre-commit
-```
-
-```bash
-pip install pre-commit
-```
-
-Then run pre-commit hooks before committing code:
-
-```bash
-pre-commit run
-```
-
-By default, pre-commit runs on staged files (only changes and additions that will be committed).
-To run pre-commit checks on all files, execute:
-
-```bash
-pre-commit run --all-files
-```
-
-Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:
-
-```bash
-pre-commit install
-```
-
-Now code linters and formatters will be run each time you commit changes.
-
-You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, althoguh please note
-that this may result in pull requests being rejected if subsequent checks fail.
-
-## Review Process
-
-Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code.
-
-For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/).
-
-Further recommended reading for successful PR reviews:
-
- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/)
- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/)
-
-## Thank You
-
-Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration!
--- a/Common/dynlink_d3d10.h
+++ b/Common/dynlink_d3d10.h
@ -0,0 +1,294 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//--------------------------------------------------------------------------------------
+// File: dynlink_d3d10.h
+//
+// Shortcut macros and functions for using DX objects
+//
+// Copyright (c) Microsoft Corporation. All rights reserved
+//--------------------------------------------------------------------------------------
+
+#ifndef _DYNLINK_D3D10_H_
+#define _DYNLINK_D3D10_H_
+
+// Standard Windows includes
+#include <windows.h>
+#include <initguid.h>
+#include <assert.h>
+#include <wchar.h>
+#include <mmsystem.h>
+#include <commctrl.h> // for InitCommonControls() 
+#include <shellapi.h> // for ExtractIcon()
+#include <new.h>      // for placement new
+#include <shlobj.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+
+// CRT's memory leak detection
+#if defined(DEBUG) || defined(_DEBUG)
+#include <crtdbg.h>
+#endif
+
+// Direct3D9 includes
+#include <d3d9.h>
+
+// Direct3D10 includes
+#include <dxgi.h>
+#include <d3d10_1.h>
+#include <d3d10.h>
+
+// XInput includes
+#include <xinput.h>
+
+// strsafe.h deprecates the old, insecure string functions.  If you
+// really do not want it to (not recommended), then uncomment the next line
+//#define STRSAFE_NO_DEPRECATE
+
+#ifndef STRSAFE_NO_DEPRECATE
+#pragma deprecated("strncpy")
+#pragma deprecated("wcsncpy")
+#pragma deprecated("_tcsncpy")
+#pragma deprecated("wcsncat")
+#pragma deprecated("strncat")
+#pragma deprecated("_tcsncat")
+#endif
+
+#pragma warning( disable : 4996 ) // disable deprecated warning 
+#include <strsafe.h>
+#pragma warning( default : 4996 )
+
+#include <DirectXMath.h>
+
+using namespace DirectX;
+//--------------------------------------------------------------------------------------
+// Structs
+//--------------------------------------------------------------------------------------
+struct DXUTD3D9DeviceSettings // Direct3D 9 alternative stored in the DXUTDeviceSettings union
+{
+    UINT AdapterOrdinal;
+    D3DDEVTYPE DeviceType;
+    D3DFORMAT AdapterFormat;
+    DWORD BehaviorFlags;
+    D3DPRESENT_PARAMETERS pp; // NOTE(review): presumably the presentation parameters used at device creation - confirm
+};
+
+struct DXUTD3D10DeviceSettings // Direct3D 10 alternative stored in the DXUTDeviceSettings union
+{
+    UINT AdapterOrdinal;
+    D3D10_DRIVER_TYPE DriverType;
+    UINT Output;
+    DXGI_SWAP_CHAIN_DESC sd; // NOTE(review): presumably the swap chain description used at creation - confirm
+    UINT32 CreateFlags;
+    UINT32 SyncInterval;
+    DWORD PresentFlags;
+    bool AutoCreateDepthStencil; // DXUT will create a depth stencil resource and view if true
+    DXGI_FORMAT AutoDepthStencilFormat;
+};
+
+enum DXUTDeviceVersion { DXUT_D3D9_DEVICE, DXUT_D3D10_DEVICE }; // tag type for DXUTDeviceSettings::ver
+struct DXUTDeviceSettings // tagged union holding either D3D9 or D3D10 settings
+{
+    DXUTDeviceVersion ver; // selects which member of the anonymous union below is valid
+    union
+    {
+        DXUTD3D9DeviceSettings d3d9; // only valid if ver == DXUT_D3D9_DEVICE
+        DXUTD3D10DeviceSettings d3d10; // only valid if ver == DXUT_D3D10_DEVICE
+    };
+};
+
+
+//--------------------------------------------------------------------------------------
+// Error codes
+//--------------------------------------------------------------------------------------
+#define DXUTERR_NODIRECT3D              MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0901) // DXUT error HRESULTs: SEVERITY_ERROR, FACILITY_ITF, codes 0x0901 and up
+#define DXUTERR_NOCOMPATIBLEDEVICES     MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0902)
+#define DXUTERR_MEDIANOTFOUND           MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0903)
+#define DXUTERR_NONZEROREFCOUNT         MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0904)
+#define DXUTERR_CREATINGDEVICE          MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0905)
+#define DXUTERR_RESETTINGDEVICE         MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0906)
+#define DXUTERR_CREATINGDEVICEOBJECTS   MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0907)
+#define DXUTERR_RESETTINGDEVICEOBJECTS  MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0908)
+#define DXUTERR_DEVICEREMOVED           MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x090A) // note: code 0x0909 is not assigned in this list
+
+
+typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **); // pointer type for dxgi.dll export "CreateDXGIFactory"; the types below follow the same pattern
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT, UINT32,
+                                             ID3D10Device **); // d3d10.dll export "D3D10CreateDevice"
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE1)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT,
+                                              D3D10_FEATURE_LEVEL1, UINT, ID3D10Device1 **); // d3d10_1.dll export "D3D10CreateDevice1"
+typedef HRESULT(WINAPI *LPD3D10CREATESTATEBLOCK)(ID3D10Device *pDevice, D3D10_STATE_BLOCK_MASK *pStateBlockMask,
+                                                 ID3D10StateBlock **ppStateBlock); // d3d10.dll export "D3D10CreateStateBlock"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKUNION)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
+                                                    D3D10_STATE_BLOCK_MASK *pResult); // d3d10.dll export "D3D10StateBlockMaskUnion"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKINTERSECT)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
+                                                        D3D10_STATE_BLOCK_MASK *pResult); // d3d10.dll export "D3D10StateBlockMaskIntersect"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDIFFERENCE)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
+                                                         D3D10_STATE_BLOCK_MASK *pResult); // d3d10.dll export "D3D10StateBlockMaskDifference"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
+                                                            D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
+                                                            UINT RangeLength); // d3d10.dll export "D3D10StateBlockMaskEnableCapture"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
+        D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
+        UINT RangeLength); // d3d10.dll export "D3D10StateBlockMaskDisableCapture"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLEALL)(D3D10_STATE_BLOCK_MASK *pMask); // d3d10.dll export "D3D10StateBlockMaskEnableAll"
+typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLEALL)(D3D10_STATE_BLOCK_MASK *pMask); // d3d10.dll export "D3D10StateBlockMaskDisableAll"
+typedef BOOL (WINAPI *LPD3D10STATEBLOCKMASKGETSETTING)(D3D10_STATE_BLOCK_MASK *pMask,
+                                                       D3D10_DEVICE_STATE_TYPES StateType, UINT Entry); // d3d10.dll export "D3D10StateBlockMaskGetSetting"; returns BOOL, not HRESULT
+
+typedef HRESULT(WINAPI *LPD3D10COMPILEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, LPCSTR pSrcFileName,
+                                                        CONST D3D10_SHADER_MACRO *pDefines,
+                                                        ID3D10Include *pInclude, UINT HLSLFlags, UINT FXFlags,
+                                                        ID3D10Blob **ppCompiledEffect, ID3D10Blob **ppErrors); // d3d10.dll export "D3D10CompileEffectFromMemory"
+typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
+                                                       ID3D10Device *pDevice,
+                                                       ID3D10EffectPool *pEffectPool,
+                                                       ID3D10Effect **ppEffect); // d3d10.dll export "D3D10CreateEffectFromMemory"
+typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTPOOLFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
+                                                           ID3D10Device *pDevice, ID3D10EffectPool **ppEffectPool); // d3d10.dll export "D3D10CreateEffectPoolFromMemory"
+
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN)(IDXGIAdapter *pAdapter,
+                                                         D3D10_DRIVER_TYPE DriverType,
+                                                         HMODULE Software,
+                                                         UINT Flags,
+                                                         UINT SDKVersion,
+                                                         DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
+                                                         IDXGISwapChain **ppSwapChain,
+                                                         ID3D10Device **ppDevice); // d3d10.dll export "D3D10CreateDeviceAndSwapChain"
+
+typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN1)(IDXGIAdapter *pAdapter,
+                                                          D3D10_DRIVER_TYPE DriverType,
+                                                          HMODULE Software,
+                                                          UINT Flags,
+                                                          D3D10_FEATURE_LEVEL1 HardwareLevel,
+                                                          UINT SDKVersion,
+                                                          DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
+                                                          IDXGISwapChain **ppSwapChain,
+                                                          ID3D10Device1 **ppDevice); // d3d10_1.dll export "D3D10CreateDeviceAndSwapChain1"
+
+// Module handles and resolved entry points: NULL here, filled in by dynlinkLoadD3D10API(), handles released by dynlinkUnloadD3D10API()
+static HMODULE                              g_hModDXGI = NULL; // handle for dxgi.dll
+static HMODULE                              g_hModD3D10 = NULL; // handle for d3d10.dll
+static HMODULE                              g_hModD3D101 = NULL; // handle for d3d10_1.dll
+static LPCREATEDXGIFACTORY                  sFnPtr_CreateDXGIFactory = NULL; // resolved from dxgi.dll
+static LPD3D10CREATESTATEBLOCK              sFnPtr_D3D10CreateStateBlock = NULL; // this and the entries below: resolved from d3d10.dll unless noted
+static LPD3D10CREATEDEVICE                  sFnPtr_D3D10CreateDevice = NULL;
+static LPD3D10CREATEDEVICE1                 sFnPtr_D3D10CreateDevice1 = NULL; // resolved from d3d10_1.dll
+static LPD3D10STATEBLOCKMASKUNION           sFnPtr_D3D10StateBlockMaskUnion = NULL;
+static LPD3D10STATEBLOCKMASKINTERSECT       sFnPtr_D3D10StateBlockMaskIntersect = NULL;
+static LPD3D10STATEBLOCKMASKDIFFERENCE      sFnPtr_D3D10StateBlockMaskDifference = NULL;
+static LPD3D10STATEBLOCKMASKENABLECAPTURE   sFnPtr_D3D10StateBlockMaskEnableCapture = NULL;
+static LPD3D10STATEBLOCKMASKDISABLECAPTURE  sFnPtr_D3D10StateBlockMaskDisableCapture = NULL;
+static LPD3D10STATEBLOCKMASKENABLEALL       sFnPtr_D3D10StateBlockMaskEnableAll = NULL;
+static LPD3D10STATEBLOCKMASKDISABLEALL      sFnPtr_D3D10StateBlockMaskDisableAll = NULL;
+static LPD3D10STATEBLOCKMASKGETSETTING      sFnPtr_D3D10StateBlockMaskGetSetting = NULL;
+static LPD3D10COMPILEEFFECTFROMMEMORY       sFnPtr_D3D10CompileEffectFromMemory = NULL;
+static LPD3D10CREATEEFFECTFROMMEMORY        sFnPtr_D3D10CreateEffectFromMemory = NULL;
+static LPD3D10CREATEEFFECTPOOLFROMMEMORY    sFnPtr_D3D10CreateEffectPoolFromMemory = NULL;
+static LPD3D10CREATEDEVICEANDSWAPCHAIN      sFnPtr_D3D10CreateDeviceAndSwapChain  = NULL;
+static LPD3D10CREATEDEVICEANDSWAPCHAIN1     sFnPtr_D3D10CreateDeviceAndSwapChain1 = NULL; // resolved from d3d10_1.dll
+
+// Unload the D3D10 DLLs and invalidate every entry point resolved from them.
+//
+// Each module is released only if its handle is non-NULL, so this may be called
+// when nothing (or only some of the modules) is loaded. Always returns true.
+static bool dynlinkUnloadD3D10API(void)
+{
+    if (g_hModD3D10)
+    {
+        FreeLibrary(g_hModD3D10);
+        g_hModD3D10 = NULL;
+    }
+
+    if (g_hModDXGI)
+    {
+        FreeLibrary(g_hModDXGI);
+        g_hModDXGI = NULL;
+    }
+
+    if (g_hModD3D101)
+    {
+        FreeLibrary(g_hModD3D101);
+        g_hModD3D101 = NULL;
+    }
+
+    // The addresses below point into the modules released above. Clear them so a
+    // stale pointer can never be called after an unload or after a failed load
+    // (dynlinkLoadD3D10API() calls this function on its failure path).
+    sFnPtr_CreateDXGIFactory                 = NULL;
+    sFnPtr_D3D10CreateStateBlock             = NULL;
+    sFnPtr_D3D10CreateDevice                 = NULL;
+    sFnPtr_D3D10CreateDevice1                = NULL;
+    sFnPtr_D3D10StateBlockMaskUnion          = NULL;
+    sFnPtr_D3D10StateBlockMaskIntersect      = NULL;
+    sFnPtr_D3D10StateBlockMaskDifference     = NULL;
+    sFnPtr_D3D10StateBlockMaskEnableCapture  = NULL;
+    sFnPtr_D3D10StateBlockMaskDisableCapture = NULL;
+    sFnPtr_D3D10StateBlockMaskEnableAll      = NULL;
+    sFnPtr_D3D10StateBlockMaskDisableAll     = NULL;
+    sFnPtr_D3D10StateBlockMaskGetSetting     = NULL;
+    sFnPtr_D3D10CompileEffectFromMemory      = NULL;
+    sFnPtr_D3D10CreateEffectFromMemory       = NULL;
+    sFnPtr_D3D10CreateEffectPoolFromMemory   = NULL;
+    sFnPtr_D3D10CreateDeviceAndSwapChain     = NULL;
+    sFnPtr_D3D10CreateDeviceAndSwapChain1    = NULL;
+
+    return true;
+}
+
+// Dynamically load the D3D10 DLLs and map the function pointers.
+//
+// Loads d3d10.dll, dxgi.dll and d3d10_1.dll and stores their exported entry
+// points in the sFnPtr_* globals. Returns true only when all three modules
+// load; otherwise everything is released through dynlinkUnloadD3D10API() and
+// false is returned. Individual GetProcAddress results are not checked, so a
+// pointer can still be NULL after a successful return if an export is missing.
+static bool dynlinkLoadD3D10API(void)
+{
+    // First check to see if the D3D10 Library is present.
+    // If it succeeds, then we can call GetProcAddress to grab all of the DX10 functions.
+    // LoadLibraryA is called explicitly because the file names are narrow string
+    // literals: the TCHAR-mapped LoadLibrary macro expands to LoadLibraryW when
+    // UNICODE is defined, and a narrow literal does not compile against it.
+    g_hModD3D10 = LoadLibraryA("d3d10.dll");
+
+    if (g_hModD3D10 != NULL)
+    {
+        sFnPtr_D3D10CreateStateBlock             = (LPD3D10CREATESTATEBLOCK)            GetProcAddress(g_hModD3D10, "D3D10CreateStateBlock");
+        sFnPtr_D3D10CreateDevice                 = (LPD3D10CREATEDEVICE)                GetProcAddress(g_hModD3D10, "D3D10CreateDevice");
+
+        sFnPtr_D3D10StateBlockMaskUnion          = (LPD3D10STATEBLOCKMASKUNION)         GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskUnion");
+        sFnPtr_D3D10StateBlockMaskIntersect      = (LPD3D10STATEBLOCKMASKINTERSECT)     GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskIntersect");
+        sFnPtr_D3D10StateBlockMaskDifference     = (LPD3D10STATEBLOCKMASKDIFFERENCE)    GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDifference");
+        sFnPtr_D3D10StateBlockMaskEnableCapture  = (LPD3D10STATEBLOCKMASKENABLECAPTURE) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableCapture");
+        sFnPtr_D3D10StateBlockMaskDisableCapture = (LPD3D10STATEBLOCKMASKDISABLECAPTURE)GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableCapture");
+
+        sFnPtr_D3D10StateBlockMaskEnableAll      = (LPD3D10STATEBLOCKMASKENABLEALL)     GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableAll");
+        sFnPtr_D3D10StateBlockMaskDisableAll     = (LPD3D10STATEBLOCKMASKDISABLEALL)    GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableAll");
+        sFnPtr_D3D10StateBlockMaskGetSetting     = (LPD3D10STATEBLOCKMASKGETSETTING)    GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskGetSetting");
+
+        sFnPtr_D3D10CompileEffectFromMemory      = (LPD3D10COMPILEEFFECTFROMMEMORY)     GetProcAddress(g_hModD3D10, "D3D10CompileEffectFromMemory");
+        sFnPtr_D3D10CreateEffectFromMemory       = (LPD3D10CREATEEFFECTFROMMEMORY)      GetProcAddress(g_hModD3D10, "D3D10CreateEffectFromMemory");
+        sFnPtr_D3D10CreateEffectPoolFromMemory   = (LPD3D10CREATEEFFECTPOOLFROMMEMORY)  GetProcAddress(g_hModD3D10, "D3D10CreateEffectPoolFromMemory");
+
+        sFnPtr_D3D10CreateDeviceAndSwapChain     = (LPD3D10CREATEDEVICEANDSWAPCHAIN)    GetProcAddress(g_hModD3D10, "D3D10CreateDeviceAndSwapChain");
+    }
+
+    g_hModDXGI = LoadLibraryA("dxgi.dll");
+
+    if (g_hModDXGI)
+    {
+        sFnPtr_CreateDXGIFactory                 = (LPCREATEDXGIFACTORY)                GetProcAddress(g_hModDXGI, "CreateDXGIFactory");
+    }
+
+    // This may fail if this machine isn't Windows Vista SP1 or later
+    g_hModD3D101 = LoadLibraryA("d3d10_1.dll");
+
+    if (g_hModD3D101 != NULL)
+    {
+        sFnPtr_D3D10CreateDevice1                = (LPD3D10CREATEDEVICE1)               GetProcAddress(g_hModD3D101, "D3D10CreateDevice1");
+        sFnPtr_D3D10CreateDeviceAndSwapChain1    = (LPD3D10CREATEDEVICEANDSWAPCHAIN1)   GetProcAddress(g_hModD3D101, "D3D10CreateDeviceAndSwapChain1");
+    }
+
+    // All three modules are required: release whatever did load and report failure.
+    if (g_hModD3D10 == NULL || g_hModDXGI == NULL || g_hModD3D101 == NULL)
+    {
+        dynlinkUnloadD3D10API();
+        return false;
+    }
+
+    return true;
+}
+
+#endif
--- a/Common/helper_cuda.h
+++ b/Common/helper_cuda.h
@ -138,29 +138,26 @@ static const char *_cudaGetErrorEnum(cufftResult error) {
    case CUFFT_UNALIGNED_DATA:
      return "CUFFT_UNALIGNED_DATA";

+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
    case CUFFT_INVALID_DEVICE:
      return "CUFFT_INVALID_DEVICE";

+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
    case CUFFT_NO_WORKSPACE:
      return "CUFFT_NO_WORKSPACE";

    case CUFFT_NOT_IMPLEMENTED:
      return "CUFFT_NOT_IMPLEMENTED";

+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
    case CUFFT_NOT_SUPPORTED:
      return "CUFFT_NOT_SUPPORTED";
-
-    case CUFFT_MISSING_DEPENDENCY:
-      return "CUFFT_MISSING_DEPENDENCY";
-
-    case CUFFT_NVRTC_FAILURE:
-      return "CUFFT_NVRTC_FAILURE";
-
-    case CUFFT_NVJITLINK_FAILURE:
-      return "CUFFT_NVJITLINK_FAILURE";
-
-    case CUFFT_NVSHMEM_FAILURE:
-      return "CUFFT_NVSHMEM_FAILURE";
  }

  return "<unknown>";
@ -671,12 +668,6 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
      {0x87, 128},
      {0x89, 128},
      {0x90, 128},
-      {0xa0, 128},
-      {0xa1, 128},
-      {0xa3, 128},
-      {0xb0, 128},
-      {0xc0, 128},
-      {0xc1, 128},
      {-1, -1}};

  int index = 0;
@ -726,12 +717,6 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
      {0x87, "Ampere"},
      {0x89, "Ada"},
      {0x90, "Hopper"},
-      {0xa0, "Blackwell"},
-      {0xa1, "Blackwell"},
-      {0xa3, "Blackwell"},
-      {0xb0, "Blackwell"},
-      {0xc0, "Blackwell"},
-      {0xc1, "Blackwell"},
      {-1, "Graphics Device"}};

  int index = 0;
--- a/Common/helper_cuda_drvapi.h
+++ b/Common/helper_cuda_drvapi.h
@ -116,12 +116,6 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
      {0x87, 128},
      {0x89, 128},
      {0x90, 128},
-      {0xa0, 128},
-      {0xa1, 128},
-      {0xa3, 128},
-      {0xb0, 128},
-      {0xc0, 128},
-      {0xc1, 128},
      {-1, -1}};

  int index = 0;
@ -244,7 +238,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
      }

      unsigned long long compute_perf =
-          ((unsigned long long)multiProcessorCount * sm_per_multiproc *
+          (unsigned long long)(multiProcessorCount * sm_per_multiproc *
                               clockRate);

      if (compute_perf > max_compute_perf) {
@ -411,3 +405,4 @@ bool inline findFatbinPath(const char *module_file, std::string &module_path, ch
  // end of CUDA Helper Functions

 #endif  // COMMON_HELPER_CUDA_DRVAPI_H_
+
--- a/Common/helper_multiprocess.cpp
+++ b/Common/helper_multiprocess.cpp
@ -168,7 +168,7 @@ int waitProcess(Process *process) {
 #endif
 }

-#if defined(__linux__) || defined(__QNX__)
+#if defined(__linux__)
 int ipcCreateSocket(ipcHandle *&handle, const char *name,
                    const std::vector<Process> &processes) {
  int server_fd;
@ -185,30 +185,25 @@ int ipcCreateSocket(ipcHandle *&handle, const char *name,
    return -1;
  }

-  char path_name[50];
-
-  // Create unique name for the socket with path if SOCK_FOLDER is set.
-  sprintf(path_name, "%s/%u", getSocketFolder().c_str(), getpid());
-
-  unlink(path_name);
-  memset(&servaddr, 0, sizeof(servaddr));
+  unlink(name);
+  bzero(&servaddr, sizeof(servaddr));
  servaddr.sun_family = AF_UNIX;

-  size_t len = strlen(path_name);
+  size_t len = strlen(name);
  if (len > (sizeof(servaddr.sun_path) - 1)) {
    perror("IPC failure: Cannot bind provided name to socket. Name too large");
    return -1;
  }

-  strncpy(servaddr.sun_path, path_name, len);
+  strncpy(servaddr.sun_path, name, len);

  if (bind(server_fd, (struct sockaddr *)&servaddr, SUN_LEN(&servaddr)) < 0) {
    perror("IPC failure: Binding socket failed");
    return -1;
  }

-  handle->socketName = new char[strlen(path_name) + 1];
-  strcpy(handle->socketName, path_name);
+  handle->socketName = new char[strlen(name) + 1];
+  strcpy(handle->socketName, name);
  handle->socket = server_fd;
  return 0;
 }
@ -224,13 +219,13 @@ int ipcOpenSocket(ipcHandle *&handle) {
    perror("IPC failure:Socket creation error");
    return -1;
  }
-  
-  memset(&cliaddr, 0, sizeof(cliaddr));
-  cliaddr.sun_family = AF_UNIX;
-  char temp[50];

-  // Create unique name for the socket with path if SOCK_FOLDER is set.
-  sprintf(temp, "%s/%u", getSocketFolder().c_str(), getpid());
+  bzero(&cliaddr, sizeof(cliaddr));
+  cliaddr.sun_family = AF_UNIX;
+  char temp[10];
+
+  // Create unique name for the socket.
+  sprintf(temp, "%u", getpid());

  strcpy(cliaddr.sun_path, temp);
  if (bind(sock, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
@ -267,48 +262,41 @@ int ipcRecvShareableHandle(ipcHandle *handle, ShareableHandle *shHandle) {
  // Union to guarantee alignment requirements for control array
  union {
    struct cmsghdr cm;
-    // This will not work on QNX as QNX CMSG_SPACE calls __cmsg_alignbytes
-    // And __cmsg_alignbytes is a runtime function instead of compile-time macros
-    // char control[CMSG_SPACE(sizeof(int))]
-    char* control;
+    char control[CMSG_SPACE(sizeof(int))];
  } control_un;

-  size_t sizeof_control = CMSG_SPACE(sizeof(int)) * sizeof(char);
-  control_un.control = (char*) malloc(sizeof_control);
  struct cmsghdr *cmptr;
  ssize_t n;
  int receivedfd;
  char dummy_buffer[1];
  ssize_t sendResult;
+
  msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof_control;
+  msg.msg_controllen = sizeof(control_un.control);

  iov[0].iov_base = (void *)dummy_buffer;
  iov[0].iov_len = sizeof(dummy_buffer);

  msg.msg_iov = iov;
  msg.msg_iovlen = 1;
+
  if ((n = recvmsg(handle->socket, &msg, 0)) <= 0) {
    perror("IPC failure: Receiving data over socket failed");
-    free(control_un.control);
    return -1;
  }

  if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) &&
      (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
    if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
-      free(control_un.control);
      return -1;
    }

    memmove(&receivedfd, CMSG_DATA(cmptr), sizeof(receivedfd));
    *(int *)shHandle = receivedfd;
  } else {
-    free(control_un.control);
    return -1;
  }

-  free(control_un.control);
  return 0;
 }

@ -331,7 +319,7 @@ int ipcSendDataToServer(ipcHandle *handle, const char *serverName,
  ssize_t sendResult;
  struct sockaddr_un serveraddr;

-  memset(&serveraddr, 0, sizeof(serveraddr));
+  bzero(&serveraddr, sizeof(serveraddr));
  serveraddr.sun_family = AF_UNIX;
  strncpy(serveraddr.sun_path, serverName, sizeof(serveraddr.sun_path) - 1);

@ -352,22 +340,19 @@ int ipcSendShareableHandle(ipcHandle *handle,

  union {
    struct cmsghdr cm;
-    char* control;
+    char control[CMSG_SPACE(sizeof(int))];
  } control_un;

-  size_t sizeof_control = CMSG_SPACE(sizeof(int)) * sizeof(char);
-  control_un.control = (char*) malloc(sizeof_control);
-
  struct cmsghdr *cmptr;
  ssize_t readResult;
  struct sockaddr_un cliaddr;
  socklen_t len = sizeof(cliaddr);

  // Construct client address to send this SHareable handle to
-  memset(&cliaddr, 0, sizeof(cliaddr));
+  bzero(&cliaddr, sizeof(cliaddr));
  cliaddr.sun_family = AF_UNIX;
-  char temp[20];
-  sprintf(temp, "%s/%u", getSocketFolder().c_str(), process);
+  char temp[10];
+  sprintf(temp, "%u", process);
  strcpy(cliaddr.sun_path, temp);
  len = sizeof(cliaddr);

@ -375,7 +360,7 @@ int ipcSendShareableHandle(ipcHandle *handle,
  int sendfd = (int)shareableHandles[data];

  msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof_control;
+  msg.msg_controllen = sizeof(control_un.control);

  cmptr = CMSG_FIRSTHDR(&msg);
  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
@ -395,11 +380,9 @@ int ipcSendShareableHandle(ipcHandle *handle,
  ssize_t sendResult = sendmsg(handle->socket, &msg, 0);
  if (sendResult <= 0) {
    perror("IPC failure: Sending data over socket failed");
-    free(control_un.control);
    return -1;
  }

-  free(control_un.control);
  return 0;
 }

--- a/Common/helper_multiprocess.h
+++ b/Common/helper_multiprocess.h
@ -54,27 +54,6 @@
 #endif
 #include <vector>

-// The Unix domain sockets creating folder on QNX has been restricted to qnx6-mounted directories since QNX SDP 8.0.3.
-#if defined(__QNX__)
-    #include <string>
-    inline std::string getSocketFolder() {
-        return "/storage";
-    }
-// Simple filesystem compatibility for GCC 8.x
-#elif defined(__GNUC__) && __GNUC__ < 9
-    #include <cstdlib>
-    #include <string>
-    inline std::string getSocketFolder() {
-        const char* tmpdir = std::getenv("TMPDIR");
-        return tmpdir ? std::string(tmpdir) : "/tmp";
-    }
-#else
-    #include <filesystem>
-    inline std::string getSocketFolder() {
-        return std::filesystem::temp_directory_path().string();
-    }
-#endif
-
 typedef struct sharedMemoryInfo_st {
    void *addr;
    size_t size;
@ -105,7 +84,7 @@ int waitProcess(Process *process);
 #define checkIpcErrors(ipcFuncResult) \
    if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); }

-#if defined(__linux__) || defined(__QNX__)
+#if defined(__linux__)
 struct ipcHandle_st {
    int socket;
    char *socketName;
--- a/Common/nvMatrix.h
+++ b/Common/nvMatrix.h
@ -258,7 +258,7 @@ namespace nv
                s[2] = &r3[0];
                s[3] = &r4[0];

-                int i,j,p,jj;
+                register int i,j,p,jj;

                for (i=0; i<4; i++)
                {
--- a/Common/nvrtc_helper.h
+++ b/Common/nvrtc_helper.h
@ -187,7 +187,6 @@ CUmodule loadCUBIN(char *cubin, int argc, char **argv) {
  CUcontext context;
  int major = 0, minor = 0;
  char deviceName[256];
-  CUctxCreateParams ctxCreateParams = {};

  // Picks the best CUDA device available
  CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
@ -201,7 +200,7 @@ CUmodule loadCUBIN(char *cubin, int argc, char **argv) {
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuInit(0));
-  checkCudaErrors(cuCtxCreate(&context, &ctxCreateParams, 0, cuDevice));
+  checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));

  checkCudaErrors(cuModuleLoadData(&module, cubin));
  free(cubin);
--- a/Common/rendercheck_d3d10.cpp
+++ b/Common/rendercheck_d3d10.cpp
@ -0,0 +1,128 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//  Utility funcs to wrap up saving a surface or the back buffer as a PPM file
+//  In addition, wraps up a threshold comparison of two PPMs.
+//
+//  These functions are designed to be used to implement an automated QA testing
+//  for SDK samples.
+//
+//  Author: Bryan Dudash
+//  Email: sdkfeedback@nvidia.com
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <helper_functions.h>
+#include <rendercheck_d3d10.h>
+
+// Saves the render target currently bound to output-merger slot 0 of pDevice
+// to a PPM file.
+//
+// pDevice   - device whose bound render target is captured.
+// zFileName - path of the PPM file to write.
+//
+// Returns the result of ResourceToPPM(), or E_FAIL when no render target is
+// bound or its underlying resource cannot be obtained.
+HRESULT CheckRenderD3D10::ActiveRenderTargetToPPM(ID3D10Device *pDevice,
+                                                  const char *zFileName) {
+  ID3D10RenderTargetView *pRTV = NULL;
+  pDevice->OMGetRenderTargets(1, &pRTV, NULL);
+
+  // Nothing bound to slot 0: bail out instead of dereferencing NULL.
+  if (pRTV == NULL) {
+    printf(
+        "ActiveRenderTargetToPPM: no render target is bound! Aborting...\n");
+    return E_FAIL;
+  }
+
+  ID3D10Resource *pSourceResource = NULL;
+  pRTV->GetResource(&pSourceResource);
+
+  HRESULT hr = E_FAIL;
+
+  if (pSourceResource != NULL) {
+    hr = ResourceToPPM(pDevice, pSourceResource, zFileName);
+
+    // GetResource() returned an owning reference; drop it.
+    pSourceResource->Release();
+  }
+
+  // OMGetRenderTargets() returned an owning reference on the view as well.
+  pRTV->Release();
+
+  return hr;
+}
+
+// Copies a 2D texture into a CPU-readable staging texture and writes its
+// contents to a PPM file.
+//
+// pDevice   - device used to create the staging texture and perform the copy.
+// pResource - resource to save; must be a 2D texture.
+// zFileName - path of the PPM file to write.
+//
+// Returns S_OK on success, or E_FAIL if pResource is not a 2D texture, the
+// staging texture cannot be created, or the staging texture cannot be mapped.
+//
+// NOTE(review): the row copy assumes 4 bytes per texel; the texture format is
+// not validated here -- confirm callers only pass 32-bit formats.
+HRESULT CheckRenderD3D10::ResourceToPPM(ID3D10Device *pDevice,
+                                        ID3D10Resource *pResource,
+                                        const char *zFileName) {
+  D3D10_RESOURCE_DIMENSION rType;
+  pResource->GetType(&rType);
+
+  if (rType != D3D10_RESOURCE_DIMENSION_TEXTURE2D) {
+    printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n");
+    return E_FAIL;
+  }
+
+  ID3D10Texture2D *pSourceTexture = (ID3D10Texture2D *)pResource;
+  ID3D10Texture2D *pTargetTexture = NULL;
+
+  // Describe a staging twin of the source: same dimensions and format, but
+  // CPU-readable and not bound to any pipeline stage.
+  D3D10_TEXTURE2D_DESC desc;
+  pSourceTexture->GetDesc(&desc);
+  desc.BindFlags = 0;
+  desc.CPUAccessFlags = D3D10_CPU_ACCESS_READ;
+  desc.Usage = D3D10_USAGE_STAGING;
+
+  if (FAILED(pDevice->CreateTexture2D(&desc, NULL, &pTargetTexture))) {
+    printf(
+        "SurfaceToPPM: Unable to create target Texture resoruce! Aborting... "
+        "\n");
+    return E_FAIL;
+  }
+
+  pDevice->CopyResource(pTargetTexture, pSourceTexture);
+
+  D3D10_MAPPED_TEXTURE2D mappedTex2D;
+
+  // Map() can fail; without this check mappedTex2D would be read
+  // uninitialised below.
+  if (FAILED(pTargetTexture->Map(0, D3D10_MAP_READ, 0, &mappedTex2D))) {
+    printf("SurfaceToPPM: Unable to map target Texture! Aborting...\n");
+    pTargetTexture->Release();
+    return E_FAIL;
+  }
+
+  // Need to convert from dx pitch to pitch=width
+  unsigned char *pPPMData = new unsigned char[desc.Width * desc.Height * 4];
+
+  for (unsigned int iHeight = 0; iHeight < desc.Height; iHeight++) {
+    // Source rows are RowPitch bytes apart; destination rows are tightly
+    // packed.
+    memcpy(
+        &(pPPMData[iHeight * desc.Width * 4]),
+        (unsigned char *)(mappedTex2D.pData) + iHeight * mappedTex2D.RowPitch,
+        desc.Width * 4);
+  }
+
+  pTargetTexture->Unmap(0);
+
+  // Prepends the PPM header info and bumps byte data afterwards
+  sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height);
+
+  delete[] pPPMData;
+  pTargetTexture->Release();
+
+  return S_OK;
+}
+
+// Compares the PPM image src_file against the reference image ref_file.
+//
+// The reference is located with sdkFindFilePath(ref_file, exec_path). When it
+// cannot be found a diagnostic is printed and false is returned; otherwise
+// the result of sdkComparePPM() with the given epsilon/threshold is returned.
+//
+// NOTE(review): the path returned by sdkFindFilePath() is never released
+// here -- confirm whether the caller of that helper owns the buffer.
+bool CheckRenderD3D10::PPMvsPPM(const char *src_file, const char *ref_file,
+                                const char *exec_path, const float epsilon,
+                                const float threshold) {
+  char *refPath = sdkFindFilePath(ref_file, exec_path);
+
+  if (refPath != NULL) {
+    const bool matches =
+        (sdkComparePPM(src_file, refPath, epsilon, threshold, true) == true);
+    return matches;
+  }
+
+  // Reference image missing: report and fail the comparison.
+  printf(
+      "CheckRenderD3D10::PPMvsPPM unable to find <%s> in <%s> "
+      "Aborting comparison!\n",
+      ref_file, exec_path);
+  printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
+  printf("Aborting comparison!\n");
+  printf("  FAILURE!\n");
+  return false;
+}
--- a/Common/rendercheck_d3d10.h
+++ b/Common/rendercheck_d3d10.h
@ -0,0 +1,53 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#ifndef _RENDERCHECK_D3D10_H_
+#define _RENDERCHECK_D3D10_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <d3d10.h>
+
+// Static helpers used for automated render checking of D3D10 samples: dump a
+// render target or texture to a PPM file and compare two PPM files.
+class CheckRenderD3D10 {
+ public:
+  // Empty constructor; all functionality is exposed through static members.
+  CheckRenderD3D10() {}
+
+  // Saves the render target bound to slot 0 of pDevice to the PPM file
+  // zFileName (delegates to ResourceToPPM).
+  static HRESULT ActiveRenderTargetToPPM(ID3D10Device *pDevice,
+                                         const char *zFileName);
+  // Saves pResource to the PPM file zFileName via a staging copy. Returns
+  // E_FAIL when pResource is not a 2D texture or the staging texture cannot
+  // be created, S_OK otherwise.
+  static HRESULT ResourceToPPM(ID3D10Device *pDevice, ID3D10Resource *pResource,
+                               const char *zFileName);
+
+  // Compares src_file against ref_file, which is located with
+  // sdkFindFilePath(ref_file, exec_path). Returns false if the reference
+  // cannot be found; otherwise returns the sdkComparePPM() result for the
+  // given epsilon and threshold.
+  static bool PPMvsPPM(const char *src_file, const char *ref_file,
+                       const char *exec_path, const float epsilon,
+                       const float threshold = 0.0f);
+};
+
+#endif
--- a/Common/rendercheck_d3d9.cpp
+++ b/Common/rendercheck_d3d9.cpp
@ -0,0 +1,167 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//  Utility funcs to wrap up saving a surface or the back buffer as a PPM file
+//  In addition, wraps up a threshold comparison of two PPMs.
+//
+//  These functions are designed to be used to implement an automated QA testing
+//  for SDK samples.
+//
+//  Author: Bryan Dudash
+//  Email: sdkfeedback@nvidia.com
+//
+// Copyright (c) NVIDIA Corporation. All rights reserved.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <helper_functions.h>
+#include <rendercheck_d3d9.h>
+
+// Originally copied from checkrender_gl.cpp and slightly modified.
+//
+// Compares the PPM image src_file against the reference image ref_file.
+// The reference is located with sdkFindFilePath(ref_file, exec_path). When it
+// cannot be found a diagnostic is printed and false is returned; otherwise
+// the result of sdkComparePPM() with the given epsilon/threshold is returned.
+//
+// NOTE(review): the path returned by sdkFindFilePath() is never released
+// here -- confirm whether the caller of that helper owns the buffer.
+bool CheckRenderD3D9::PPMvsPPM(const char *src_file, const char *ref_file,
+                               const char *exec_path, const float epsilon,
+                               const float threshold) {
+  char *refPath = sdkFindFilePath(ref_file, exec_path);
+
+  if (refPath != NULL) {
+    const bool matches =
+        (sdkComparePPM(src_file, refPath, epsilon, threshold, true) == true);
+    return matches;
+  }
+
+  // Reference image missing: report and fail the comparison.
+  printf(
+      "CheckRenderD3D9::PPMvsPPM unable to find <%s> in <%s> "
+      "Aborting comparison!\n",
+      ref_file, exec_path);
+  printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
+  printf("Aborting comparison!\n");
+  printf("  FAILURE!\n");
+  return false;
+}
+
+// Obtains the device's back buffer with
+// GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO) and writes it to the PPM file
+// zFileName through SurfaceToPPM().
+//
+// Returns E_FAIL if the back buffer cannot be obtained, otherwise whatever
+// SurfaceToPPM() returns.
+HRESULT CheckRenderD3D9::BackbufferToPPM(IDirect3DDevice9 *pDevice,
+                                         const char *zFileName) {
+  IDirect3DSurface9 *pBackBuffer = NULL;
+  const HRESULT hrGet =
+      pDevice->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
+
+  if (FAILED(hrGet)) {
+    printf("Unable to get the back buffer.  Aborting...\n");
+    return E_FAIL;
+  }
+
+  // Debug aid:
+  // D3DXSaveSurfaceToFile("C:\\bing.dds",D3DXIFF_DDS,pBackBuffer,NULL,NULL);
+
+  const HRESULT hrSave = SurfaceToPPM(pDevice, pBackBuffer, zFileName);
+
+  // Done with the back buffer surface obtained above.
+  pBackBuffer->Release();
+
+  return hrSave;
+}
+
+// Copies a render-target surface into a system-memory surface and writes it
+// to a PPM file, swapping the red and blue channels of every pixel.
+//
+// pDevice   - device used to create the transfer texture and read the data.
+// pSurface  - surface to save; only D3DFMT_A8R8G8B8 and D3DFMT_X8R8G8B8 are
+//             accepted.
+// zFileName - path of the PPM file to write.
+//
+// Returns S_OK on success, E_INVALIDARG for any other surface format, or
+// E_FAIL when a Direct3D call needed for the transfer fails. Interfaces
+// created here are released on every exit path.
+HRESULT CheckRenderD3D9::SurfaceToPPM(IDirect3DDevice9 *pDevice,
+                                      IDirect3DSurface9 *pSurface,
+                                      const char *zFileName) {
+  D3DSURFACE_DESC pDesc;
+  pSurface->GetDesc(&pDesc);
+
+  // $$ For now only support common 8bit formats.  TODO: support for more
+  // complex formats via conversion?
+  if (!(pDesc.Format == D3DFMT_A8R8G8B8 || pDesc.Format == D3DFMT_X8R8G8B8)) {
+    return E_INVALIDARG;
+  }
+
+  IDirect3DTexture9 *pTargetTex = NULL;
+
+  if (FAILED(pDevice->CreateTexture(pDesc.Width, pDesc.Height, 1,
+                                    D3DUSAGE_DYNAMIC, pDesc.Format,
+                                    D3DPOOL_SYSTEMMEM, &pTargetTex, NULL))) {
+    printf("Unable to create texture for surface transfer! Aborting...\n");
+    return E_FAIL;
+  }
+
+  IDirect3DSurface9 *pTargetSurface = NULL;
+
+  if (FAILED(pTargetTex->GetSurfaceLevel(0, &pTargetSurface))) {
+    printf("Unable to get surface for surface transfer! Aborting...\n");
+    pTargetTex->Release();
+    return E_FAIL;
+  }
+
+  // This is required because we cannot lock a D3DPOOL_DEFAULT surface
+  // directly. So, we copy to our sysmem surface.
+  if (FAILED(pDevice->GetRenderTargetData(pSurface, pTargetSurface))) {
+    printf(
+        "Unable to GetRenderTargetData() for surface transfer! Aborting...\n");
+    pTargetSurface->Release();
+    pTargetTex->Release();
+    return E_FAIL;
+  }
+
+  D3DLOCKED_RECT lockedRect;
+
+  // A failed lock leaves lockedRect unusable, so stop here rather than read
+  // through it.
+  if (FAILED(pTargetSurface->LockRect(&lockedRect, NULL, 0))) {
+    printf("Unable to lock surface for surface transfer! Aborting...\n");
+    pTargetSurface->Release();
+    pTargetTex->Release();
+    return E_FAIL;
+  }
+
+  // Need to convert from dx pitch to pitch=width
+  //
+  // $ PPM is BGR and not RGB it seems. Saved image looks "funny" in viewer(red
+  // and blue swapped), but since ref will be dumped using same method, this is
+  // ok.
+  //      however, if we want the saved image to be properly colored, then we
+  //      can swizzle the color bytes here.
+  unsigned char *pPPMData = new unsigned char[pDesc.Width * pDesc.Height * 4];
+
+  for (unsigned int iHeight = 0; iHeight < pDesc.Height; iHeight++) {
+#if 1  // swizzle to implement RGB to BGR conversion.
+
+    for (unsigned int iWidth = 0; iWidth < pDesc.Width; iWidth++) {
+      DWORD color = *(DWORD *)((unsigned char *)(lockedRect.pBits) +
+                               iHeight * lockedRect.Pitch + iWidth * 4);
+
+      // R<->B, [7:0] <-> [23:16], swizzle
+      color = ((color & 0xFF) << 16) | (color & 0xFF00) |
+              ((color & 0xFF0000) >> 16) | (color & 0xFF000000);
+
+      memcpy(&(pPPMData[(iHeight * pDesc.Width + iWidth) * 4]),
+             (unsigned char *)&color, 4);
+    }
+
+#else
+    memcpy(&(pPPMData[iHeight * pDesc.Width * 4]),
+           (unsigned char *)(lockedRect.pBits) + iHeight * lockedRect.Pitch,
+           pDesc.Width * 4);
+#endif
+  }
+
+  pTargetSurface->UnlockRect();
+
+  // Prepends the PPM header info and bumps byte data afterwards
+  sdkSavePPM4ub(zFileName, pPPMData, pDesc.Width, pDesc.Height);
+
+  delete[] pPPMData;
+  pTargetSurface->Release();
+  pTargetTex->Release();
+
+  return S_OK;
+}
--- a/Common/rendercheck_d3d9.h
+++ b/Common/rendercheck_d3d9.h
@ -0,0 +1,54 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#ifndef _RENDERCHECK_D3D9_H_
+#define _RENDERCHECK_D3D9_H_
+
+#include <assert.h>
+#include <d3d9.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Static helpers used for automated render checking of D3D9 samples: dump a
+// surface or the back buffer to a PPM file and compare two PPM files.
+class CheckRenderD3D9 {
+ public:
+  // Empty constructor; all functionality is exposed through static members.
+  CheckRenderD3D9() {}
+
+  // Obtains the back buffer via GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO)
+  // and saves it to the PPM file zFileName (delegates to SurfaceToPPM).
+  // Returns E_FAIL if the back buffer cannot be obtained.
+  static HRESULT BackbufferToPPM(IDirect3DDevice9 *pDevice,
+                                 const char *zFileName);
+  // Saves pSurface to the PPM file zFileName with red and blue swapped.
+  // Returns E_INVALIDARG unless the surface format is D3DFMT_A8R8G8B8 or
+  // D3DFMT_X8R8G8B8, and E_FAIL if the transfer to system memory fails.
+  static HRESULT SurfaceToPPM(IDirect3DDevice9 *pDevice,
+                              IDirect3DSurface9 *pSurface,
+                              const char *zFileName);
+
+  // Compares src_file against ref_file, which is located with
+  // sdkFindFilePath(ref_file, exec_path). Returns false if the reference
+  // cannot be found; otherwise returns the sdkComparePPM() result for the
+  // given epsilon and threshold.
+  static bool PPMvsPPM(const char *src_file, const char *ref_file,
+                       const char *exec_path, const float epsilon,
+                       const float threshold = 0.0f);
+};
+
+#endif
--- a/Samples/7_libNVVM/syscalls/CMakeLists.txt
+++ b/Samples/7_libNVVM/syscalls/CMakeLists.txt
@ -1,4 +1,6 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+###############################################################################
+#
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@ -23,32 +25,45 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+###############################################################################
+#
+# CUDA Samples
+#
+###############################################################################

+TARGET_ARCH ?= $(shell uname -m)

-add_test(NAME test-syscalls-malloc-free
-	COMMAND "${CMAKE_CURRENT_BINARY_DIR}/../ptxgen/ptxgen" "${CMAKE_CURRENT_SOURCE_DIR}/malloc-free.ll"
-  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+# Project folders that contain CUDA samples
+PROJECTS ?= $(shell find Samples -name Makefile)

-add_test(NAME test-syscalls-vprintf
-	COMMAND "${CMAKE_CURRENT_BINARY_DIR}/../ptxgen/ptxgen" "${CMAKE_CURRENT_SOURCE_DIR}/vprintf.ll"
-  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+FILTER_OUT :=

-set_tests_properties(test-syscalls-vprintf test-syscalls-malloc-free
-                     PROPERTIES FIXTURES_REQUIRED PTXGENTEST)
+PROJECTS := $(filter-out $(FILTER_OUT),$(PROJECTS))

-# Install to CUDA_SAMPLES_INSTALL_DIR if defined (for unified installation),
-# otherwise install to bin/syscalls (for standalone libNVVM build)
-if(DEFINED CUDA_SAMPLES_INSTALL_DIR)
-    install(FILES malloc-free.ll DESTINATION ${CUDA_SAMPLES_INSTALL_DIR}/syscalls)
-    install(FILES vprintf.ll DESTINATION ${CUDA_SAMPLES_INSTALL_DIR}/syscalls)
-else()
-    install(FILES malloc-free.ll DESTINATION bin/syscalls)
-    install(FILES vprintf.ll DESTINATION bin/syscalls)
-endif()
+%.ph_build :
+	+@$(MAKE) -C $(dir $*) $(MAKECMDGOALS)

-file(COPY malloc-free.ll DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
-file(COPY vprintf.ll DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
+%.ph_test :
+	+@$(MAKE) -C $(dir $*) testrun

-# Copy the .ll files to the folder of executable file for full testing
-file(COPY malloc-free.ll DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../ptxgen/syscalls)
-file(COPY vprintf.ll DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/../ptxgen/syscalls)
+%.ph_clean : 
+	+@$(MAKE) -C $(dir $*) clean $(USE_DEVICE)
+
+%.ph_clobber :
+	+@$(MAKE) -C $(dir $*) clobber $(USE_DEVICE)
+
+all:  $(addsuffix .ph_build,$(PROJECTS))
+	@echo "Finished building CUDA samples"
+
+build: $(addsuffix .ph_build,$(PROJECTS))
+
+test : $(addsuffix .ph_test,$(PROJECTS))
+
+tidy:
+	@find * | egrep "#" | xargs rm -f
+	@find * | egrep "\~" | xargs rm -f
+
+clean: tidy $(addsuffix .ph_clean,$(PROJECTS))
+
+clobber: clean $(addsuffix .ph_clobber,$(PROJECTS))
--- a/README.md
+++ b/README.md
@ -1,20 +1,22 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 13.1](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers that demonstrate features in the CUDA Toolkit. This version supports [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

 This section describes the release notes for the CUDA Samples on GitHub only.

-### Change Log
+### CUDA 12.1
+* Added JIT LTO Sample
+* Added Large Kernel Sample

-### [Revision History](./CHANGELOG.md)
+### [older versions...](./CHANGELOG.md)

 ## Getting Started

 ### Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).

 ### Getting the CUDA Samples
@ -28,400 +30,43 @@ Without using git the easiest way to use these samples is to download the zip fi

 ## Building CUDA Samples

-### Building CUDA Samples
-
-The CUDA Samples are built using CMake. Follow the instructions below for building on Linux, Windows, and for cross-compilation to Tegra devices.
-
-### Linux
-
-Ensure that CMake (version 3.20 or later) is installed. Install it using your package manager if necessary:
-
-e.g.
-```sudo apt install cmake```
-
-Navigate to the root of the cloned repository and create a build directory:
-```
-mkdir build && cd build
-```
-Configure the project with CMake:
-```
-cmake ..
-```
-Build the samples:
-```
-make -j$(nproc)
-```
-Run the samples from their respective directories in the build folder. You can also follow this process from any subdirectory of the samples repo, or from within any individual sample.
-
 ### Windows

-Language services for CMake are available in Visual Studio 2019 version 16.5 or later, and you can directly import the CUDA samples repository from either the root level or from any
-subdirectory or individual sample.
-
-To build from the command line, open the `x64 Native Tools Command Prompt for VS` provided with your Visual Studio installation.
-
-Navigate to the root of the cloned repository and create a build directory:
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
 ```
-mkdir build && cd build
+*_vs<version>.sln - for Visual Studio <version>
 ```
-Configure the project with CMake - for example:
+Complete samples solution files exist at parent directory of the repo:
+
+Each individual sample has its own set of solution files at:
+`<CUDA_SAMPLES_REPO>\Samples\<sample_dir>\`
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
 ```
-cmake .. -G "Visual Studio 16 2019" -A x64
+$ cd <sample_dir>
+$ make
 ```
-Open the generated solution file CUDA_Samples.sln in Visual Studio. Build the samples by selecting the desired configuration (e.g., Debug or Release) and pressing F7 (Build Solution).
-
-Run the samples from the output directories specified in Visual Studio.
-
-### Enabling On-GPU Debugging
-
-NVIDIA GPUs support on-GPU debugging through cuda-gdb. Enabling this may significantly affect application performance as certain compiler optimizations are disabled
-in this configuration, hence it's not on by default. Enablement of on-device debugging is controlled via the `-G` switch to nvcc.
-
-To enable cuda-gdb for samples builds, define the `ENABLE_CUDA_DEBUG` flag on the CMake command line. For example:
-
-```
-cmake -DENABLE_CUDA_DEBUG=True ...
-```
-
-### Platform-Specific Samples
-
-Some CUDA samples are specific to certain platforms, and require passing flags into CMake to enable. In particular, we define the following platform-specific flags:
-
-* `BUILD_TEGRA` - for Tegra-specific samples
-
-To build these samples, set the variables either on the command line or through your CMake GUI. For example:
-
-```
-cmake -DBUILD_TEGRA=True ..
-```
-
-### Cross-Compilation for Tegra Platforms
-
-Install the NVIDIA toolchain and cross-compilation environment for Tegra devices as described in the Tegra Development Guide.
-
-Ensure that CMake (version 3.20 or later) is installed.
-
-Navigate to the root of the cloned repository and create a build directory:
-```
-mkdir build && cd build
-```
-Configure the project with CMake, specifying the Tegra toolchain file. And you can use -DTARGET_FS to point to the target file system root path for necessary include and library files:
-```
-cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake -DTARGET_FS=/path/to/target/system/file/system
-```
-Build the samples:
-```
-make -j$(nproc)
-```
-Transfer the built binaries to the Tegra device and execute them there.
-
-
-### Cross Building for Automotive Linux Platforms from the DriveOS Docker containers
-
-To build CUDA samples to the target platform from the DriveOS Docker containers, use the following instructions.
-
-Mount the target Root Filesystem (RFS) in the container so that the CUDA cmake process has the correct paths to CUDA and other system libraries required to build the samples.
-
-Create a temporary directory, `<temp>` is any temporary directory of your choosing, for example, you can use `/drive/temp`:
-
-```
-$ mkdir /drive/<temp>
-```
-
-Mount the filesystem by running the following command:
-
-```
-$ mount /drive/drive-linux/filesystem/targetfs-images/dev_nsr_desktop_ubuntu-24.04_thor_rfs.img /drive/temp
-```
-
-Configure the project by running the following cmake command:
-
-```
-$ mkdir build && cd build
-$ cmake .. -DBUILD_TEGRA=True \
-  -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-  -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake \
-  -DTARGET_FS=/drive/temp \
-  -DCMAKE_LIBRARY_PATH=/drive/temp/usr/local/cuda-13.1/thor/lib64/ \
-  -DCMAKE_INCLUDE_PATH=/drive/temp/usr/local/cuda-13.1/thor/include/
-```
-
-Please note that the following libraries are not pre-installed in the DriveOS dev-nsr target filesystem:
-* libdrm-dev
-* Vulkan
-
-This causes the cmake command to throw errors related to the missing files, and as a result, the related samples will not build in later steps. This issue will be addressed in a future DriveOS release.
-
-To build the samples while ignoring the errors mentioned above, you can use `--ignore-errors`/`--keep-going`, or comment out the corresponding `add_subdirectory` commands in the CMakeLists.txt in the parent folder for the samples requiring Vulkan and libdrm-dev:
-
-```
-$ make -j$(nproc) --ignore-errors # or --keep-going
-```
-
-```
-# In Samples/5_Domain_Specific/CMakeLists.txt
-# add_subdirectory(simpleGL)
-# add_subdirectory(simpleVulkan)
-# add_subdirectory(simpleVulkanMMAP)
-
-# In Samples/8_Platform_Specific/Tegra/CMakeLists.txt
-# add_subdirectory(simpleGLES_EGLOutput)
-```
-
-### QNX
-
-Cross-compilation for QNX with CMake is supported in the CUDA 13.0 samples release and newer. An example build for
-the Tegra Thor QNX platform might look like this:
-
-```
-$ mkdir build
-$ cd build
-
-QNX_HOST=/path/to/qnx/host \
-QNX_TARGET=/path/to/qnx/target \
-cmake .. \
-DBUILD_TEGRA=True \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda-safe-13.0/bin/nvcc \
-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-qnx.cmake \
-DCMAKE_LIBRARY_PATH=/usr/local/cuda-safe-13.0/thor/targets/aarch64-qnx/lib/stubs/ \
-DCMAKE_INCLUDE_PATH=/usr/local/cuda-safe-13.0/thor/targets/aarch64-qnx/include/
-```
-
-### Forward Compatibility
-
-To build samples with a new CUDA Toolkit (CUDA 13.0 or later) and UMD (version 580 or later) but an old KMD (version 550 or earlier), you need to set `CMAKE_PREFIX_PATH` so that the new driver library is used. The command might look like this:
-
-```
-cmake -DCMAKE_PREFIX_PATH=/usr/local/cuda/lib64/stubs/ ..
-```
-
-## Install Samples
-
-### Installation Path Structure
-
-The installation system automatically organizes samples into a structured directory layout based on:
- **Target Architecture**: ${CMAKE_SYSTEM_PROCESSOR}, e.g. `x64`, `aarch64`, `amd64`, etc.
- **Target OS**: `linux`, `windows`, `darwin`, `qnx`
- **Build Type**: `release`, `debug`, etc.
-
-The default installation path is: `build/bin/${TARGET_ARCH}/${TARGET_OS}/${BUILD_TYPE}`
-
-**Examples:**
- Linux x86_64 Release: `build/bin/x64/linux/release`
- Linux aarch64 Release: `build/bin/aarch64/linux/release`
- Windows amd64 Release: `build/bin/amd64/windows/release`
-
-### Customizing Installation Paths
-
-You can customize the installation location using CMake variables during the configuration step:
-
- `CMAKE_INSTALL_PREFIX`: Changes the root installation directory (default: `build/bin`)
-  ```
-  cmake -DCMAKE_INSTALL_PREFIX=/custom/path ..
-  ```
-  This will install to: `/custom/path/${TARGET_ARCH}/${TARGET_OS}/${BUILD_TYPE}`
-
- `CUDA_SAMPLES_INSTALL_DIR`: Specifies the exact final installation directory (overrides the structured path)
-  ```
-  cmake -DCUDA_SAMPLES_INSTALL_DIR=/exact/install/path ..
-  ```
-
-### Install Samples on Linux
-
-**Prerequisites:** You must first configure the project with CMake as described in the [Building CUDA Samples - Linux](#linux) or [Building CUDA Samples](#building-cuda-samples) section.
-
-After configuring and building, install the samples:
-
-```
-cd build/
-make install
-```
-
-### Install Samples on Windows
-
-**Prerequisites:** You must first configure the project with CMake as described in the [Building CUDA Samples - Windows](#windows) section.
-
-#### Using Command Line
-
-After configuring with CMake, build and install from the `x64 Native Tools Command Prompt for VS`:
-
-```cmd
-cd build
-cmake --build . --config Release
-cmake --install . --config Release
-```
-
-**Note:** Replace `Release` with `Debug` if you want to install debug builds. For multi-configuration generators (like Visual Studio), the `--config` flag determines which build type to install.
-
-#### Using Visual Studio IDE
-
-Alternatively, open the generated solution file `CUDA_Samples.sln` in Visual Studio:
-1. Select the desired configuration (`Release` or `Debug`)
-2. Build the solution (F7 or Build > Build Solution)
-3. Right-click on the `INSTALL` target under `CMakePredefinedTargets` in Solution Explorer
-4. Select "Build"
-
-## Running All Samples as Tests
-
-It's important to note that the CUDA samples are _not_ intended as a validation suite for CUDA. They do not cover corner cases, they do not completely cover the
-runtime and driver APIs, are not intended for performance benchmarking, etc. That said, it can sometimes be useful to run all of the samples as a quick sanity check and
-we provide a script to do so, `run_tests.py`.
-
-This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. It accepts
-the following command line arguments:
-
-| Switch     | Purpose                                                                                                        | Example                 |
-| ---------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- |
-| --dir      | Specify the root directory to search for executables (recursively)                                             | --dir ./build/Samples   |
-| --config   | JSON configuration file for executable arguments                                                               | --config test_args.json |
-| --output   | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test         |
-| --args     | Global arguments to pass to all executables (not currently used)                                               | --args arg_1 arg_2 ...  |
-| --parallel | Number of applications to execute in parallel.                                                                 | --parallel 8            |
-
-
-Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows).
-
-The script returns 0 on success, or the first non-zero error code encountered during testing on failure. It will also print a condensed list of samples that failed, if any.
-
-There are three primary modes of configuration:
-
-**Skip**
-
-An executable configured with "skip" will not be executed. These generally rely on having attached graphical displays and are not suited to this kind of automation.
-
-Configuration example:
-```json
-"fluidsGL": {
-    "skip": true
-}
-```
-
-You will see:
-```
-Skipping fluidsGL (marked as skip in config)
-```
-
-**Single Run**
-
-For executables to run one time only with arguments, specify each argument as a list entry. Each entry in the JSON file will be appended to the command line, separated
-by a space.
-
-All applications execute from their current directory, so all paths are relative to the application's location.
-
-Note that if an application needs no arguments, this entry is optional. An executable found without a matching entry in the JSON will just run as `./application` from its
-current directory.
-
-Configuration example:
-```json
-"ptxgen": {
-    "args": [
-        "test.ll",
-        "-arch=compute_75"
-    ]
-}
-```
-
-You will see:
-```
-Running ptxgen
-    Command: ./ptxgen test.ll -arch=compute_75
-    Test completed with return code 0
-```
-
-**Multiple Runs**
-
-For executables to run multiple times with different command line arguments, specify any number of sets of args within a "runs" list.
-
-As with single runs, all applications execute from their current directory, so all paths are relative to the application's location.
-
-Configuration example:
-```json
-"recursiveGaussian": {
-    "runs": [
-        {
-            "args": [
-                "-sigma=10",
-                "-file=data/ref_10.ppm"
-            ]
-        },
-        {
-            "args": [
-                "-sigma=14",
-                "-file=data/ref_14.ppm"
-            ]
-        },
-        {
-            "args": [
-                "-sigma=18",
-                "-file=data/ref_18.ppm"
-            ]
-        },
-        {
-            "args": [
-                "-sigma=22",
-                "-file=data/ref_22.ppm"
-            ]
-        }
-    ]
-}
-```
-
-You will see:
-```
-Running recursiveGaussian (run 1/4)
-    Command: ./recursiveGaussian -sigma=10 -file=data/ref_10.ppm
-    Test completed with return code 0
-Running recursiveGaussian (run 2/4)
-    Command: ./recursiveGaussian -sigma=14 -file=data/ref_14.ppm
-    Test completed with return code 0
-Running recursiveGaussian (run 3/4)
-    Command: ./recursiveGaussian -sigma=18 -file=data/ref_18.ppm
-    Test completed with return code 0
-Running recursiveGaussian (run 4/4)
-    Command: ./recursiveGaussian -sigma=22 -file=data/ref_22.ppm
-    Test completed with return code 0
-```
-
-### Example Usage
-
-Here is an example set of commands to build and test all of the samples.
-
-First, build:
-```bash
-mkdir build
-cd build
-cmake ..
-make -j$(nproc)
-```
-
-Now, return to the samples root directory and run the test script:
-```bash
-cd ..
-python3 run_tests.py --output ./test --dir ./build/Samples --config test_args.json
-```
-
-If all applications run successfully, you will see something similar to this (the specific number of samples will depend on your build type
-and system configuration):
-
-```
-Test Summary:
-Ran 199 test runs for 180 executables.
-All test runs passed!
-```
-
-If some samples fail, you will see something like this:
-
-```
-Test Summary:
-Ran 199 test runs for 180 executables.
-Failed runs (2):
-  bicubicTexture (run 1/5): Failed (code 1)
-  Mandelbrot (run 1/2): Failed (code 1)
-```
-
-You can inspect the stdout logs in the output directory (generally `APM_<application_name>.txt` or `APM_<application_name>.run<n>.txt`) to help
-determine what may have gone wrong from the output logs. Please file issues against the samples repository if you believe a sample is failing
-incorrectly on your system.
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details on cross platform compilation of cuda samples.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=g++
+    ```

 ## Samples list

@ -446,12 +91,6 @@ Samples that are specific to domain (Graphics, Finance, Image Processing).
 ### [6. Performance](./Samples/6_Performance/README.md)
 Samples that demonstrate performance optimization.

-### [7. libNVVM](./Samples/7_libNVVM/README.md)
-Samples that demonstrate the use of libNVVM and NVVM IR.
-
-### [8. Platform Specific](./Samples/8_Platform_Specific/Tegra/README.md)
-Samples that are specific to certain platforms (Tegra, cuDLA, NvMedia, NvSci, OpenGL ES).
-
 ## Dependencies

 Some CUDA Samples rely on third-party applications and/or libraries, or features provided by the CUDA Toolkit and Driver, to either build or execute. These dependencies are listed below.
@ -468,7 +107,7 @@ These third-party dependencies are required by some CUDA samples. If available,

 FreeImage is an open source imaging library. FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the FreeImage website.

-To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `./Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ Debug/ execution folder or pass the FreeImage folder when cmake configuring with the `-DFreeImage_INCLUDE_DIR` and `-DFreeImage_LIBRARY` options.
+To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `../../../Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the root-level `bin/win64/Debug` and `bin/win64/Release` folders.

 #### Message Passing Interface

@ -494,27 +133,13 @@ OpenGL is a graphics library used for 2D and 3D rendering. On systems which supp

 OpenGL ES is an embedded systems graphics library used for 2D and 3D rendering. On systems which support OpenGL ES, NVIDIA's OpenGL ES implementation is provided with the CUDA Driver.

-#### Freeglut
-
-Freeglut is an open-source software library that serves as a replacement for the original OpenGL Utility Toolkit (GLUT). Its primary purpose is to make it easier for developers to create and manage windows containing OpenGL contexts, as well as handle input from devices like the mouse, keyboard, and joystick, across a wide range of platforms. To set up Freeglut on a Windows on ARM system, you need to download the source from the [Freeglut website](https://freeglut.sourceforge.net/), build freeglut on your system, and copy the freeglut.lib into the folder `./Common/lib/x64` and copy the freeglut.dll file into the `./bin/win64/${BUILD_TYPE}` execution folder.
-
 #### Vulkan

 Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan targets high-performance realtime 3D graphics applications such as video games and interactive media across all platforms. On systems which support Vulkan, NVIDIA's Vulkan implementation is provided with the CUDA Driver. For building and running Vulkan applications one needs to install the [Vulkan SDK](https://www.lunarg.com/vulkan-sdk/).

-#### GLEW
-
-GLEW (OpenGL Extension Wrangler Library) is a cross-platform, open-source C/C++ library designed to simplify the process of using modern OpenGL features and extensions. Its main function is to dynamically load OpenGL function pointers at runtime, allowing developers to access both core OpenGL functions and additional features provided by hardware vendors, known as extensions. To set up GLEW on a Windows on ARM system, you need to download the source from [GLEW website](https://glew.sourceforge.net/), build GLEW on your system, and copy the glew32.lib into the folder `./Common/lib/x64` and the glew32.dll into the `./bin/win64/${BUILD_TYPE}` execution folder.
-
-#### GLFW
-
-GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
-
-To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html) and extract the zip file into a folder, then pass the GLFW include header folder as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR` when configuring with CMake.
-
 #### OpenMP

-OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). For compilers such as clang, `libomp.so` and other components for LLVM must be installed separately. You will also need to set additional flags in your CMake configuration files, such as: `-DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY="/path/to/libomp.so"`.
+OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).

 #### Screen

@ -620,10 +245,6 @@ FP16 is a 16-bit floating-point format. One bit is used for the sign, five bits

 NVCC support of [C++11 features](https://en.wikipedia.org/wiki/C++11).

-#### CMake
-
-The libNVVM samples are built using [CMake](https://cmake.org/) 3.10 or later.
-
 ## Contributors Guide

 We welcome your input on issues and suggestions for samples. At this time we are not accepting contributions from the public, check back here as we evolve our contribution model.
--- a/Samples/0_Introduction/CMakeLists.txt
+++ b/Samples/0_Introduction/CMakeLists.txt
@ -1,46 +0,0 @@
-add_subdirectory(UnifiedMemoryStreams)
-add_subdirectory(asyncAPI)
-add_subdirectory(clock)
-add_subdirectory(clock_nvrtc)
-add_subdirectory(cudaOpenMP)
-add_subdirectory(fp16ScalarProduct)
-add_subdirectory(matrixMul)
-add_subdirectory(matrixMulDrv)
-add_subdirectory(matrixMulDynlinkJIT)
-add_subdirectory(matrixMul_nvrtc)
-add_subdirectory(mergeSort)
-add_subdirectory(simpleAWBarrier)
-add_subdirectory(simpleAssert)
-add_subdirectory(simpleAssert_nvrtc)
-add_subdirectory(simpleAtomicIntrinsics)
-add_subdirectory(simpleAtomicIntrinsics_nvrtc)
-add_subdirectory(simpleAttributes)
-add_subdirectory(simpleCUDA2GL)
-add_subdirectory(simpleCallback)
-add_subdirectory(simpleCooperativeGroups)
-add_subdirectory(simpleCubemapTexture)
-add_subdirectory(simpleDrvRuntime)
-add_subdirectory(simpleHyperQ)
-add_subdirectory(simpleIPC)
-add_subdirectory(simpleLayeredTexture)
-add_subdirectory(simpleMPI)
-add_subdirectory(simpleMultiCopy)
-add_subdirectory(simpleMultiGPU)
-add_subdirectory(simpleOccupancy)
-add_subdirectory(simpleP2P)
-add_subdirectory(simplePitchLinearTexture)
-add_subdirectory(simplePrintf)
-add_subdirectory(simpleStreams)
-add_subdirectory(simpleSurfaceWrite)
-add_subdirectory(simpleTemplates)
-add_subdirectory(simpleTexture)
-add_subdirectory(simpleTexture3D)
-add_subdirectory(simpleTextureDrv)
-add_subdirectory(simpleVoteIntrinsics)
-add_subdirectory(simpleZeroCopy)
-add_subdirectory(template)
-add_subdirectory(systemWideAtomics)
-add_subdirectory(vectorAdd)
-add_subdirectory(vectorAddDrv)
-add_subdirectory(vectorAddMMAP)
-add_subdirectory(vectorAdd_nvrtc)
--- a/Samples/0_Introduction/README.md
+++ b/Samples/0_Introduction/README.md
@ -4,12 +4,24 @@
 ### [asyncAPI](./asyncAPI)
 This sample illustrates the usage of CUDA events for both GPU timing and overlapping CPU and GPU execution. Events are inserted into a stream of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can perform computations while GPU is executing (including DMA memcopies between the host and device). CPU can query CUDA events to determine whether GPU has completed tasks.

+### [c++11_cuda](./c++11_cuda)
+This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the x, y, z, and w characters.
+
 ### [clock](./clock)
 This example shows how to use the clock function to measure the performance of block of threads of a kernel accurately.

 ### [clock_nvrtc](./clock_nvrtc)
 This example shows how to use the clock function using libNVRTC to measure the performance of block of threads of a kernel accurately.

+### [concurrentKernels](./concurrentKernels)
+This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function.
+
+### [cppIntegration](./cppIntegration)
+This example demonstrates how to integrate CUDA into an existing C++ application, i.e. the CUDA entry point on host side is only a function which is called from C++ code and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from cpp.
+
+### [cppOverload](./cppOverload)
+This sample demonstrates how to use C++ function overloading on the GPU.
+
 ### [cudaOpenMP](./cudaOpenMP)
 This sample demonstrates how to use OpenMP API to write an application for multiple GPUs.

@ -94,6 +106,9 @@ Use of Pitch Linear Textures
 ### [simplePrintf](./simplePrintf)
 This basic CUDA Runtime API sample demonstrates how to use the printf function in the device code.

+### [simpleSeparateCompilation](./simpleSeparateCompilation)
+This sample demonstrates a CUDA 5.0 feature, the ability to create a GPU device static library and use it within another CUDA kernel.  This example demonstrates how to pass in a GPU device function (from the GPU device static library) as a function pointer to be called.  This sample requires devices with compute capability 2.0 or higher.
+
 ### [simpleStreams](./simpleStreams)
 This sample uses CUDA streams to overlap kernel executions with memory copies between the host and a GPU device.  This sample uses a new CUDA 4.0 feature that supports pinning of generic host memory.  Requires Compute Capability 2.0 or higher.

@ -103,6 +118,9 @@ Simple example that demonstrates the use of 2D surface references (Write-to-Text
 ### [simpleTemplates](./simpleTemplates)
 This sample is a templatized version of the template project. It also shows how to correctly templatize dynamically allocated shared memory arrays.

+### [simpleTemplates_nvrtc](./simpleTemplates_nvrtc)
+This sample is a templatized version of the template project. It also shows how to correctly templatize dynamically allocated shared memory arrays.
+
 ### [simpleTexture](./simpleTexture)
 Simple example that demonstrates use of Textures in CUDA.

@ -115,6 +133,9 @@ Simple example that demonstrates use of Textures in CUDA.  This sample uses the
 ### [simpleVoteIntrinsics](./simpleVoteIntrinsics)
 Simple program which demonstrates how to use the Vote (__any_sync, __all_sync) intrinsic instruction in a CUDA kernel.

+### [simpleVoteIntrinsics_nvrtc](./simpleVoteIntrinsics_nvrtc)
+Simple program which demonstrates how to use the Vote (any, all) intrinsic instruction in a CUDA kernel with runtime compilation using NVRTC APIs. Requires Compute Capability 2.0 or higher.
+
 ### [simpleZeroCopy](./simpleZeroCopy)
 This sample illustrates how to use Zero MemCopy, kernels can read and write directly to pinned system memory.

@ -138,3 +159,4 @@ This Vector Addition sample is a basic sample that is implemented element by ele

 ### [vectorAddMMAP](./vectorAddMMAP)
 This sample replaces the device allocation in the vectorAddDrv sample with cuMemMap-ed allocations.  This sample demonstrates that the cuMemMap api allows the user to specify the physical properties of their memory while retaining the contiguous nature of their access, thus not requiring a change in their program structure.
+
--- a/Samples/0_Introduction/UnifiedMemoryStreams/.vscode/tasks.json
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/0_Introduction/UnifiedMemoryStreams/CMakeLists.txt
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/CMakeLists.txt
@ -1,55 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
-
-project(UnifiedMemoryStreams LANGUAGES C CXX CUDA)
-
-find_package(CUDAToolkit REQUIRED)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
-endif()
-
-# Include directories and libraries
-include_directories(../../../Common)
-
-# This sample is not supported on QNX
-if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
-    message(STATUS "Will not build sample ${PROJECT_NAME} - not supported on QNX")
-    return()
-endif()
-
-# Source file
-if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    find_package(OpenMP REQUIRED C CXX)
-else()
-    find_package(OpenMP REQUIRED)
-endif()
-
-if(${OpenMP_FOUND})
-    # Add target for UnifiedMemoryStreams
-    add_executable(UnifiedMemoryStreams UnifiedMemoryStreams.cu)
-
-target_compile_options(UnifiedMemoryStreams PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
-
-target_compile_features(UnifiedMemoryStreams PRIVATE cxx_std_17 cuda_std_17)
-
-    set_target_properties(UnifiedMemoryStreams PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-    target_link_libraries(UnifiedMemoryStreams PUBLIC
-        CUDA::cublas
-        OpenMP::OpenMP_CXX
-    )
-else()
-    message(STATUS "OpenMP not found - will not build sample 'UnifiedMemoryStreams'")
-endif()
-
-# Include installation configuration
-include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
-setup_samples_install()
--- a/Samples/0_Introduction/UnifiedMemoryStreams/Makefile
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/Makefile
@ -0,0 +1,381 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += -D_QNX_SOURCE
+        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - UnifiedMemoryStreams is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Attempt to compile a minimal OpenMP application. If a.out exists, OpenMP is properly set up.
+ifneq (,$(filter $(TARGET_OS),linux android))
+
+ifneq (,$(filter $(TARGET_OS), android))
+     LIBRARIES += -lomp
+else
+     LIBRARIES += -lgomp
+endif
+
+ALL_CCFLAGS += -Xcompiler -fopenmp
+$(shell echo "#include <omp.h>" > test.c ; echo "int main() { omp_get_num_threads(); return 0; }" >> test.c ; $(HOST_COMPILER) -fopenmp test.c)
+OPENMP ?= $(shell find a.out 2>/dev/null)
+
+ifeq ($(OPENMP),)
+      $(info -----------------------------------------------------------------------------------------------)
+      $(info WARNING - OpenMP is unable to compile)
+      $(info -----------------------------------------------------------------------------------------------)
+      $(info   This CUDA Sample cannot be built if the OpenMP compiler is not set up correctly.)
+      $(info   This will be a dry-run of the Makefile.)
+      $(info   For more information on how to set up your environment to build and run this )
+      $(info   sample, please refer the CUDA Samples documentation and release notes)
+      $(info -----------------------------------------------------------------------------------------------)
+      SAMPLE_ENABLED := 0
+endif
+
+$(shell rm a.out test.c 2>/dev/null)
+else
+LIBRARIES += -lpthread
+ALL_CCFLAGS += -DUSE_PTHREADS
+endif
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+LIBRARIES += -lcublas
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: UnifiedMemoryStreams
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+UnifiedMemoryStreams.o:UnifiedMemoryStreams.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+UnifiedMemoryStreams: UnifiedMemoryStreams.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./UnifiedMemoryStreams
+
+testrun: build
+
+clean:
+	rm -f UnifiedMemoryStreams UnifiedMemoryStreams.o
+	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/UnifiedMemoryStreams
+
+clobber: clean
--- a/Samples/0_Introduction/UnifiedMemoryStreams/NsightEclipse.xml
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/NsightEclipse.xml
@ -0,0 +1,88 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>UnifiedMemoryStreams</name>
+  <cuda_api_list>
+    <toolkit>cudaStreamDestroy</toolkit>
+    <toolkit>cudaFree</toolkit>
+    <toolkit>cudaMallocManaged</toolkit>
+    <toolkit>cudaStreamAttachMemAsync</toolkit>
+    <toolkit>cudaSetDevice</toolkit>
+    <toolkit>cudaDeviceSynchronize</toolkit>
+    <toolkit>cudaStreamSynchronize</toolkit>
+    <toolkit>cudaStreamCreate</toolkit>
+    <toolkit>cudaGetDeviceProperties</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This sample demonstrates the use of OpenMP and streams with Unified Memory on a single GPU.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">CUDA Systems Integration</concept>
+    <concept level="basic">OpenMP</concept>
+    <concept level="basic">CUBLAS</concept>
+    <concept level="basic">Multithreading</concept>
+    <concept level="basic">Unified Memory</concept>
+    <concept level="basic">CUDA Streams and Events</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>CUDA</keyword>
+    <keyword>CUBLAS</keyword>
+    <keyword>OpenMP</keyword>
+    <keyword>cluster</keyword>
+    <keyword>multi-GPU Support</keyword>
+    <keyword>Unified Memory</keyword>
+    <keyword>UVM</keyword>
+    <keyword>openMP</keyword>
+    <keyword>Streams</keyword>
+    <keyword>pthreads</keyword>
+  </keywords>
+  <libraries>
+    <library>cublas</library>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>UnifiedMemoryStreams.cu</primary_file>
+  <required_dependencies>
+    <dependency>OpenMP</dependency>
+    <dependency>UVM</dependency>
+    <dependency>CUBLAS</dependency>
+  </required_dependencies>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>1:CUDA Systems Integration</scope>
+    <scope>1:Unified Memory</scope>
+  </scopes>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>sbsa</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <from>3.5</from>
+  </supported_sm_architectures>
+  <title>Unified Memory Streams</title>
+  <type>exe</type>
+</entry>
--- a/Samples/0_Introduction/UnifiedMemoryStreams/README.md
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/README.md
@ -16,7 +16,7 @@ Linux, Windows

 ## Supported CPU Architecture

-x86_64, armv7l
+x86_64, ppc64le, armv7l

 ## CUDA APIs involved

@ -28,7 +28,45 @@ cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSe

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=\<arch\>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=\<host_compiler\>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=g++
+    ```
+
 ## References (for more details)
+
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu
@ -31,10 +31,10 @@
 */

 // system includes
-#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
+#include <algorithm>
 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else
@ -51,289 +51,291 @@
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
 // functions
-void   srand48(long seed) { srand((unsigned int)seed); }
+void srand48(long seed) { srand((unsigned int)seed); }
 double drand48() { return double(rand()) / RAND_MAX; }
 #endif

 const char *sSDKname = "UnifiedMemoryStreams";

 // simple task
-template <typename T> struct Task
-{
-    unsigned int size, id;
-    T           *data;
-    T           *result;
-    T           *vector;
+template <typename T>
+struct Task {
+  unsigned int size, id;
+  T *data;
+  T *result;
+  T *vector;

-    Task()
-        : size(0)
-        , id(0)
-        , data(NULL)
-        , result(NULL)
-        , vector(NULL) {};
-    Task(unsigned int s)
-        : size(s)
-        , id(0)
-        , data(NULL)
-        , result(NULL)
-    {
-        // allocate unified memory -- the operation performed in this example will
-        // be a DGEMV
-        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
-        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
-        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
-        checkCudaErrors(cudaDeviceSynchronize());
+  Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
+  Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
+    // allocate unified memory -- the operation performed in this example will
+    // be a DGEMV
+    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
+    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
+    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
+    checkCudaErrors(cudaDeviceSynchronize());
+  }
+
+  ~Task() {
+    // ensure all memory is deallocated
+    checkCudaErrors(cudaDeviceSynchronize());
+    checkCudaErrors(cudaFree(data));
+    checkCudaErrors(cudaFree(result));
+    checkCudaErrors(cudaFree(vector));
+  }
+
+  void allocate(const unsigned int s, const unsigned int unique_id) {
+    // allocate unified memory outside of constructor
+    id = unique_id;
+    size = s;
+    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
+    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
+    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    // populate data with random elements
+    for (unsigned int i = 0; i < size * size; i++) {
+      data[i] = drand48();
    }

-    ~Task()
-    {
-        // ensure all memory is deallocated
-        checkCudaErrors(cudaDeviceSynchronize());
-        checkCudaErrors(cudaFree(data));
-        checkCudaErrors(cudaFree(result));
-        checkCudaErrors(cudaFree(vector));
-    }
-
-    void allocate(const unsigned int s, const unsigned int unique_id)
-    {
-        // allocate unified memory outside of constructor
-        id   = unique_id;
-        size = s;
-        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
-        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
-        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
-        checkCudaErrors(cudaDeviceSynchronize());
-
-        // populate data with random elements
-        for (unsigned int i = 0; i < size * size; i++) {
-            data[i] = drand48();
-        }
-
-        for (unsigned int i = 0; i < size; i++) {
-            result[i] = 0.;
-            vector[i] = drand48();
-        }
+    for (unsigned int i = 0; i < size; i++) {
+      result[i] = 0.;
+      vector[i] = drand48();
    }
+  }
 };

 #ifdef USE_PTHREADS
-struct threadData_t
-{
-    int             tid;
-    Task<double>   *TaskListPtr;
-    cudaStream_t   *streams;
-    cublasHandle_t *handles;
-    int             taskSize;
+struct threadData_t {
+  int tid;
+  Task<double> *TaskListPtr;
+  cudaStream_t *streams;
+  cublasHandle_t *handles;
+  int taskSize;
 };

 typedef struct threadData_t threadData;
 #endif

 // simple host dgemv: assume data is in row-major format and square
-template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
-{
-    // rows
-    for (int i = 0; i < n; i++) {
-        result[i] *= beta;
+template <typename T>
+void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
+  // rows
+  for (int i = 0; i < n; i++) {
+    result[i] *= beta;

-        for (int j = 0; j < n; j++) {
-            result[i] += A[i * n + j] * x[j];
-        }
+    for (int j = 0; j < n; j++) {
+      result[i] += A[i * n + j] * x[j];
    }
+  }
 }

 // execute a single task on either host or device depending on size
 #ifdef USE_PTHREADS
-void *execute(void *inpArgs)
-{
-    threadData     *dataPtr = (threadData *)inpArgs;
-    cudaStream_t   *stream  = dataPtr->streams;
-    cublasHandle_t *handle  = dataPtr->handles;
-    int             tid     = dataPtr->tid;
+void *execute(void *inpArgs) {
+  threadData *dataPtr = (threadData *)inpArgs;
+  cudaStream_t *stream = dataPtr->streams;
+  cublasHandle_t *handle = dataPtr->handles;
+  int tid = dataPtr->tid;

-    for (int i = 0; i < dataPtr->taskSize; i++) {
-        Task<double> &t = dataPtr->TaskListPtr[i];
+  for (int i = 0; i < dataPtr->taskSize; i++) {
+    Task<double> &t = dataPtr->TaskListPtr[i];

-        if (t.size < 100) {
-            // perform on host
-            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+    if (t.size < 100) {
+      // perform on host
+      printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
+             t.size);

-            // attach managed memory to a (dummy) stream to allow host access while
-            // the device is running
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
-            // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
-            checkCudaErrors(cudaStreamSynchronize(stream[0]));
-            // call the host operation
-            gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-        }
-        else {
-            // perform on device
-            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
-            double one  = 1.0;
-            double zero = 0.0;
+      // attach managed memory to a (dummy) stream to allow host access while
+      // the device is running
+      checkCudaErrors(
+          cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+      checkCudaErrors(
+          cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+      checkCudaErrors(
+          cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+      // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
+      checkCudaErrors(cudaStreamSynchronize(stream[0]));
+      // call the host operation
+      gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
+    } else {
+      // perform on device
+      printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
+             t.size);
+      double one = 1.0;
+      double zero = 0.0;

-            // attach managed memory to my stream
-            checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
-            // call the device operation
-            checkCudaErrors(cublasDgemv(
-                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
-        }
+      // attach managed memory to my stream
+      checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
+      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
+                                               cudaMemAttachSingle));
+      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
+                                               cudaMemAttachSingle));
+      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
+                                               cudaMemAttachSingle));
+      // call the device operation
+      checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
+                                  &one, t.data, t.size, t.vector, 1, &zero,
+                                  t.result, 1));
    }
+  }

-    pthread_exit(NULL);
+  pthread_exit(NULL);
 }
 #else
-template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
-{
-    if (t.size < 100) {
-        // perform on host
-        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+template <typename T>
+void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
+             int tid) {
+  if (t.size < 100) {
+    // perform on host
+    printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
+           t.size);

-        // attach managed memory to a (dummy) stream to allow host access while the
-        // device is running
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
-        // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
-        checkCudaErrors(cudaStreamSynchronize(stream[0]));
-        // call the host operation
-        gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    }
-    else {
-        // perform on device
-        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
-        double one  = 1.0;
-        double zero = 0.0;
+    // attach managed memory to a (dummy) stream to allow host access while the
+    // device is running
+    checkCudaErrors(
+        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+    checkCudaErrors(
+        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+    checkCudaErrors(
+        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+    // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
+    checkCudaErrors(cudaStreamSynchronize(stream[0]));
+    // call the host operation
+    gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
+  } else {
+    // perform on device
+    printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
+           t.size);
+    double one = 1.0;
+    double zero = 0.0;

-        // attach managed memory to my stream
-        checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
-        // call the device operation
-        checkCudaErrors(cublasDgemv(
-            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
-    }
+    // attach managed memory to my stream
+    checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
+    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
+                                             cudaMemAttachSingle));
+    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
+                                             cudaMemAttachSingle));
+    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
+                                             cudaMemAttachSingle));
+    // call the device operation
+    checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
+                                &one, t.data, t.size, t.vector, 1, &zero,
+                                t.result, 1));
+  }
 }
 #endif

 // populate a list of tasks with random sizes
-template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
-{
-    for (unsigned int i = 0; i < TaskList.size(); i++) {
-        // generate random size
-        int size;
-        size = std::max((int)(drand48() * 1000.0), 64);
-        TaskList[i].allocate(size, i);
-    }
+template <typename T>
+void initialise_tasks(std::vector<Task<T> > &TaskList) {
+  for (unsigned int i = 0; i < TaskList.size(); i++) {
+    // generate random size
+    int size;
+    size = std::max((int)(drand48() * 1000.0), 64);
+    TaskList[i].allocate(size, i);
+  }
 }

-int main(int argc, char **argv)
-{
-    // set device
-    cudaDeviceProp device_prop;
-    int            dev_id = findCudaDevice(argc, (const char **)argv);
-    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
+int main(int argc, char **argv) {
+  // set device
+  cudaDeviceProp device_prop;
+  int dev_id = findCudaDevice(argc, (const char **)argv);
+  checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

-    if (!device_prop.managedMemory) {
-        // This samples requires being run on a device that supports Unified Memory
-        fprintf(stderr, "Unified Memory not supported on this device\n");
+  if (!device_prop.managedMemory) {
+    // This sample requires being run on a device that supports Unified Memory
+    fprintf(stderr, "Unified Memory not supported on this device\n");

-        exit(EXIT_WAIVED);
-    }
+    exit(EXIT_WAIVED);
+  }

-    int computeMode;
-    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev_id));
-    if (computeMode == cudaComputeModeProhibited) {
-        // This sample requires being run with a default or process exclusive mode
-        fprintf(stderr,
-                "This sample requires a device in either default or process "
-                "exclusive mode\n");
+  if (device_prop.computeMode == cudaComputeModeProhibited) {
+    // This sample requires being run with a default or process exclusive mode
+    fprintf(stderr,
+            "This sample requires a device in either default or process "
+            "exclusive mode\n");

-        exit(EXIT_WAIVED);
-    }
+    exit(EXIT_WAIVED);
+  }

-    // randomise task sizes
-    int seed = (int)time(NULL);
-    srand48(seed);
+  // randomise task sizes
+  int seed = (int)time(NULL);
+  srand48(seed);

-    // set number of threads
-    const int nthreads = 4;
+  // set number of threads
+  const int nthreads = 4;

-    // number of streams = number of threads
-    cudaStream_t   *streams = new cudaStream_t[nthreads + 1];
-    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
+  // number of streams = number of threads + 1 (stream 0 is for host tasks)
+  cudaStream_t *streams = new cudaStream_t[nthreads + 1];
+  cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];

-    for (int i = 0; i < nthreads + 1; i++) {
-        checkCudaErrors(cudaStreamCreate(&streams[i]));
-        checkCudaErrors(cublasCreate(&handles[i]));
-    }
+  for (int i = 0; i < nthreads + 1; i++) {
+    checkCudaErrors(cudaStreamCreate(&streams[i]));
+    checkCudaErrors(cublasCreate(&handles[i]));
+  }

-    // create list of N tasks
-    unsigned int              N = 40;
-    std::vector<Task<double>> TaskList(N);
-    initialise_tasks(TaskList);
+  // create list of N tasks
+  unsigned int N = 40;
+  std::vector<Task<double> > TaskList(N);
+  initialise_tasks(TaskList);

-    printf("Executing tasks on host / device\n");
+  printf("Executing tasks on host / device\n");

 // run through all tasks using threads and streams
 #ifdef USE_PTHREADS
-    pthread_t   threads[nthreads];
-    threadData *InputToThreads = new threadData[nthreads];
+  pthread_t threads[nthreads];
+  threadData *InputToThreads = new threadData[nthreads];

-    for (int i = 0; i < nthreads; i++) {
-        checkCudaErrors(cudaSetDevice(dev_id));
-        InputToThreads[i].tid     = i;
-        InputToThreads[i].streams = streams;
-        InputToThreads[i].handles = handles;
+  for (int i = 0; i < nthreads; i++) {
+    checkCudaErrors(cudaSetDevice(dev_id));
+    InputToThreads[i].tid = i;
+    InputToThreads[i].streams = streams;
+    InputToThreads[i].handles = handles;

-        if ((TaskList.size() / nthreads) == 0) {
-            InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
-            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
-        }
-        else {
-            if (i == nthreads - 1) {
-                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
-                InputToThreads[i].TaskListPtr =
-                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
-            }
-            else {
-                InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
-                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
-            }
-        }
-
-        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
-    }
-    for (int i = 0; i < nthreads; i++) {
-        pthread_join(threads[i], NULL);
+    if ((TaskList.size() / nthreads) == 0) {
+      InputToThreads[i].taskSize = (TaskList.size() / nthreads);
+      InputToThreads[i].TaskListPtr =
+          &TaskList[i * (TaskList.size() / nthreads)];
+    } else {
+      if (i == nthreads - 1) {
+        InputToThreads[i].taskSize =
+            (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+        InputToThreads[i].TaskListPtr =
+            &TaskList[i * (TaskList.size() / nthreads) +
+                      (TaskList.size() % nthreads)];
+      } else {
+        InputToThreads[i].taskSize = (TaskList.size() / nthreads);
+        InputToThreads[i].TaskListPtr =
+            &TaskList[i * (TaskList.size() / nthreads)];
+      }
    }
+
+    pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
+  }
+  for (int i = 0; i < nthreads; i++) {
+    pthread_join(threads[i], NULL);
+  }
 #else
-    omp_set_num_threads(nthreads);
+  omp_set_num_threads(nthreads);
 #pragma omp parallel for schedule(dynamic)
-    for (int i = 0; i < TaskList.size(); i++) {
-        checkCudaErrors(cudaSetDevice(dev_id));
-        int tid = omp_get_thread_num();
-        execute(TaskList[i], handles, streams, tid);
-    }
+  for (int i = 0; i < TaskList.size(); i++) {
+    checkCudaErrors(cudaSetDevice(dev_id));
+    int tid = omp_get_thread_num();
+    execute(TaskList[i], handles, streams, tid);
+  }
 #endif

-    cudaDeviceSynchronize();
+  cudaDeviceSynchronize();

-    // Destroy CUDA Streams, cuBlas handles
-    for (int i = 0; i < nthreads + 1; i++) {
-        cudaStreamDestroy(streams[i]);
-        cublasDestroy(handles[i]);
-    }
+  // Destroy CUDA Streams, cuBlas handles
+  for (int i = 0; i < nthreads + 1; i++) {
+    cudaStreamDestroy(streams[i]);
+    cublasDestroy(handles[i]);
+  }

-    // Free TaskList
-    std::vector<Task<double>>().swap(TaskList);
+  // Free TaskList
+  std::vector<Task<double> >().swap(TaskList);

-    printf("All Done!\n");
-    exit(EXIT_SUCCESS);
+  printf("All Done!\n");
+  exit(EXIT_SUCCESS);
 }
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2017.sln
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2017.vcxproj
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2017.vcxproj
@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryStreams_vs2017</RootNamespace>
+    <ProjectName>UnifiedMemoryStreams</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+      <AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="UnifiedMemoryStreams.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2019.sln
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2019.vcxproj
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2019.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryStreams_vs2019</RootNamespace>
+    <ProjectName>UnifiedMemoryStreams</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+      <AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="UnifiedMemoryStreams.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2022.sln
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2022.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2022
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2022.vcxproj
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams_vs2022.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryStreams_vs2022</RootNamespace>
+    <ProjectName>UnifiedMemoryStreams</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+      <AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="UnifiedMemoryStreams.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/asyncAPI/.vscode/tasks.json
+++ b/Samples/0_Introduction/asyncAPI/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/0_Introduction/asyncAPI/CMakeLists.txt
+++ b/Samples/0_Introduction/asyncAPI/CMakeLists.txt
@ -1,34 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
-
-project(asyncAPI LANGUAGES C CXX CUDA)
-
-find_package(CUDAToolkit REQUIRED)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
-endif()
-
-# Include directories and libraries
-include_directories(../../../Common)
-
-# Source file
-# Add target for asyncAPI
-add_executable(asyncAPI asyncAPI.cu)
-
-target_compile_options(asyncAPI PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
-
-target_compile_features(asyncAPI PRIVATE cxx_std_17 cuda_std_17)
-
-set_target_properties(asyncAPI PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-# Include installation configuration
-include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
-setup_samples_install()
--- a/Samples/0_Introduction/asyncAPI/Makefile
+++ b/Samples/0_Introduction/asyncAPI/Makefile
@ -0,0 +1,341 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project (only supported on Mac OS X and Linux platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += -D_QNX_SOURCE
+        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility.
+# Make's $(sort) orders words lexically (so "100" would sort before "90"); the
+# highest SM is therefore picked with a numeric `sort -n` instead.
+HIGHEST_SM := $(lastword $(shell printf '%s\n' $(SMS) | sort -n))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+# None of these targets produce a file of the same name; declare them phony so
+# that a stray file or directory (e.g. "build" or "clean") cannot mask them.
+.PHONY: all build check.deps run testrun clean clobber
+
+all: build
+
+build: asyncAPI
+
+# Report whether the sample was waived earlier in this Makefile.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Compile the CUDA source into an object file.
+asyncAPI.o:asyncAPI.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Link the executable and copy it into the shared bin/ tree.
+asyncAPI: asyncAPI.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./asyncAPI
+
+testrun: build
+	$(EXEC) ./asyncAPI --dummy-test-param
+
+# Remove local build products and the copy placed in the shared bin/ tree.
+clean:
+	rm -f asyncAPI asyncAPI.o
+	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/asyncAPI
+
+clobber: clean
--- a/Samples/0_Introduction/asyncAPI/NsightEclipse.xml
+++ b/Samples/0_Introduction/asyncAPI/NsightEclipse.xml
@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>asyncAPI</name>
+  <cuda_api_list>
+    <toolkit>cudaProfilerStop</toolkit>
+    <toolkit>cudaMalloc</toolkit>
+    <toolkit>cudaMemcpyAsync</toolkit>
+    <toolkit>cudaFree</toolkit>
+    <toolkit>cudaMallocHost</toolkit>
+    <toolkit>cudaProfilerStart</toolkit>
+    <toolkit>cudaDeviceSynchronize</toolkit>
+    <toolkit>cudaEventRecord</toolkit>
+    <toolkit>cudaFreeHost</toolkit>
+    <toolkit>cudaMemset</toolkit>
+    <toolkit>cudaEventDestroy</toolkit>
+    <toolkit>cudaEventQuery</toolkit>
+    <toolkit>cudaEventElapsedTime</toolkit>
+    <toolkit>cudaGetDeviceProperties</toolkit>
+    <toolkit>cudaEventCreate</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This sample illustrates the usage of CUDA events for both GPU timing and overlapping CPU and GPU execution. Events are inserted into a stream of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can perform computations while GPU is executing (including DMA memcopies between the host and device). CPU can query CUDA events to determine whether GPU has completed tasks.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">Asynchronous Data Transfers</concept>
+    <concept level="basic">CUDA Streams and Events</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>GPGPU</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>asyncAPI.cu</primary_file>
+  <qatests>
+    <qatest>--dummy-test-param</qatest>
+  </qatests>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>1:Performance Strategies</scope>
+  </scopes>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm53</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <sm-arch>sm80</sm-arch>
+  <sm-arch>sm86</sm-arch>
+  <sm-arch>sm87</sm-arch>
+  <sm-arch>sm89</sm-arch>
+  <sm-arch>sm90</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>sbsa</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>asyncAPI</title>
+  <type>exe</type>
+</entry>
--- a/Samples/0_Introduction/asyncAPI/README.md
+++ b/Samples/0_Introduction/asyncAPI/README.md
@ -18,7 +18,7 @@ Linux, Windows

 ## Supported CPU Architecture

-x86_64, armv7l
+x86_64, ppc64le, armv7l

 ## CUDA APIs involved

@ -27,6 +27,44 @@ cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaPro

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```

 ## References (for more details)
+
--- a/Samples/0_Introduction/asyncAPI/asyncAPI.cu
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI.cu
@ -38,107 +38,105 @@
 #include <stdio.h>

 // includes CUDA Runtime
-#include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
+#include <cuda_profiler_api.h>

 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h> // helper utility functions
+#include <helper_functions.h>  // helper utility functions

-__global__ void increment_kernel(int *g_data, int inc_value)
-{
-    int idx     = blockIdx.x * blockDim.x + threadIdx.x;
-    g_data[idx] = g_data[idx] + inc_value;
+// Adds inc_value to one element of g_data per thread. The element is selected
+// by the thread's global 1-D index; the kernel performs no bounds check.
+__global__ void increment_kernel(int *g_data, int inc_value) {
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  g_data[tid] += inc_value;
 }

-bool correct_output(int *data, const int n, const int x)
-{
-    for (int i = 0; i < n; i++)
-        if (data[i] != x) {
-            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
-            return false;
-        }
-
-    return true;
-}
-
-int main(int argc, char *argv[])
-{
-    int            devID;
-    cudaDeviceProp deviceProps;
-
-    printf("[%s] - Starting...\n", argv[0]);
-
-    // This will pick the best possible CUDA capable device
-    devID = findCudaDevice(argc, (const char **)argv);
-
-    // get device name
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s]\n", deviceProps.name);
-
-    int n      = 16 * 1024 * 1024;
-    int nbytes = n * sizeof(int);
-    int value  = 26;
-
-    // allocate host memory
-    int *a = 0;
-    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
-    memset(a, 0, nbytes);
-
-    // allocate device memory
-    int *d_a = 0;
-    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
-    checkCudaErrors(cudaMemset(d_a, 255, nbytes));
-
-    // set kernel launch configuration
-    dim3 threads = dim3(512, 1);
-    dim3 blocks  = dim3(n / threads.x, 1);
-
-    // create cuda event handles
-    cudaEvent_t start, stop;
-    checkCudaErrors(cudaEventCreate(&start));
-    checkCudaErrors(cudaEventCreate(&stop));
-
-    StopWatchInterface *timer = NULL;
-    sdkCreateTimer(&timer);
-    sdkResetTimer(&timer);
-
-    checkCudaErrors(cudaDeviceSynchronize());
-    float gpu_time = 0.0f;
-
-    // asynchronously issue work to the GPU (all to stream 0)
-    checkCudaErrors(cudaProfilerStart());
-    sdkStartTimer(&timer);
-    cudaEventRecord(start, 0);
-    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
-    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
-    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
-    cudaEventRecord(stop, 0);
-    sdkStopTimer(&timer);
-    checkCudaErrors(cudaProfilerStop());
-
-    // have CPU do some work while waiting for stage 1 to finish
-    unsigned long int counter = 0;
-
-    while (cudaEventQuery(stop) == cudaErrorNotReady) {
-        counter++;
+bool correct_output(int *data, const int n, const int x) {
+  for (int i = 0; i < n; i++)
+    if (data[i] != x) {
+      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+      return false;
    }

-    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
-
-    // print the cpu and gpu times
-    printf("time spent executing by the GPU: %.2f\n", gpu_time);
-    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
-
-    // check the output for correctness
-    bool bFinalResults = correct_output(a, n, value);
-
-    // release resources
-    checkCudaErrors(cudaEventDestroy(start));
-    checkCudaErrors(cudaEventDestroy(stop));
-    checkCudaErrors(cudaFreeHost(a));
-    checkCudaErrors(cudaFree(d_a));
-
-    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
+  return true;
+}
+
+// Entry point. Issues a host-to-device copy, the increment kernel and a
+// device-to-host copy asynchronously on stream 0, lets the CPU count loop
+// iterations while polling an event until the GPU work completes, prints the
+// timings, and exits with EXIT_SUCCESS only if every element equals `value`.
+int main(int argc, char *argv[]) {
+  int devID;
+  cudaDeviceProp deviceProps;
+
+  printf("[%s] - Starting...\n", argv[0]);
+
+  // This will pick the best possible CUDA capable device
+  devID = findCudaDevice(argc, (const char **)argv);
+
+  // get device name
+  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+  printf("CUDA device [%s]\n", deviceProps.name);
+
+  int n = 16 * 1024 * 1024;
+  int nbytes = n * sizeof(int);
+  int value = 26;
+
+  // allocate host memory (zero-filled, so the expected result is `value`)
+  int *a = 0;
+  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+  memset(a, 0, nbytes);
+
+  // allocate device memory
+  int *d_a = 0;
+  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+  checkCudaErrors(cudaMemset(d_a, 255, nbytes));
+
+  // set kernel launch configuration: one thread per element
+  // (n is an exact multiple of threads.x, so no element is left uncovered)
+  dim3 threads = dim3(512, 1);
+  dim3 blocks = dim3(n / threads.x, 1);
+
+  // create cuda event handles
+  cudaEvent_t start, stop;
+  checkCudaErrors(cudaEventCreate(&start));
+  checkCudaErrors(cudaEventCreate(&stop));
+
+  StopWatchInterface *timer = NULL;
+  sdkCreateTimer(&timer);
+  sdkResetTimer(&timer);
+
+  checkCudaErrors(cudaDeviceSynchronize());
+  float gpu_time = 0.0f;
+
+  // asynchronously issue work to the GPU (all to stream 0)
+  // Every call is checked so that a failed enqueue is reported at the call
+  // site instead of surfacing later as a data mismatch.
+  checkCudaErrors(cudaProfilerStart());
+  sdkStartTimer(&timer);
+  checkCudaErrors(cudaEventRecord(start, 0));
+  checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
+  increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
+  // a kernel launch returns no status; query the launch error explicitly
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
+  checkCudaErrors(cudaEventRecord(stop, 0));
+  sdkStopTimer(&timer);
+  checkCudaErrors(cudaProfilerStop());
+
+  // have CPU do some work while waiting for stage 1 to finish
+  unsigned long int counter = 0;
+
+  // the loop ends as soon as the query returns anything but cudaErrorNotReady
+  while (cudaEventQuery(stop) == cudaErrorNotReady) {
+    counter++;
+  }
+
+  checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
+
+  // print the cpu and gpu times
+  printf("time spent executing by the GPU: %.2f\n", gpu_time);
+  printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
+  printf("CPU executed %lu iterations while waiting for GPU to finish\n",
+         counter);
+
+  // check the output for correctness
+  bool bFinalResults = correct_output(a, n, value);
+
+  // release resources
+  checkCudaErrors(cudaEventDestroy(start));
+  checkCudaErrors(cudaEventDestroy(stop));
+  checkCudaErrors(cudaFreeHost(a));
+  checkCudaErrors(cudaFree(d_a));
+
+  exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
 }
--- a/Samples/0_Introduction/asyncAPI/asyncAPI_vs2017.sln
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/asyncAPI/asyncAPI_vs2017.vcxproj
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI_vs2017.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>asyncAPI_vs2017</RootNamespace>
+    <ProjectName>asyncAPI</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="asyncAPI.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/asyncAPI/asyncAPI_vs2019.sln
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/asyncAPI/asyncAPI_vs2019.vcxproj
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>asyncAPI_vs2019</RootNamespace>
+    <ProjectName>asyncAPI</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="asyncAPI.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/asyncAPI/asyncAPI_vs2022.sln
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI_vs2022.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2022
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/asyncAPI/asyncAPI_vs2022.vcxproj
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI_vs2022.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>asyncAPI_vs2022</RootNamespace>
+    <ProjectName>asyncAPI</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="asyncAPI.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/3_CUDA_Features/graphConditionalNodes/.vscode/c_cpp_properties.json
+++ b/Samples/3_CUDA_Features/graphConditionalNodes/.vscode/c_cpp_properties.json
--- a/Samples/3_CUDA_Features/graphConditionalNodes/.vscode/extensions.json
+++ b/Samples/3_CUDA_Features/graphConditionalNodes/.vscode/extensions.json
--- a/Samples/6_Performance/cudaGraphsPerfScaling/.vscode/launch.json
+++ b/Samples/6_Performance/cudaGraphsPerfScaling/.vscode/launch.json
@ -4,7 +4,7 @@
            "name": "CUDA C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
-            "program": "${workspaceFolder}/cudaGraphsPerfScaling"
+            "program": "${workspaceFolder}/c++11_cuda"
        }
    ]
 }
--- a/Samples/0_Introduction/c++11_cuda/.vscode/tasks.json
+++ b/Samples/0_Introduction/c++11_cuda/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/0_Introduction/c++11_cuda/Makefile
+++ b/Samples/0_Introduction/c++11_cuda/Makefile
@ -0,0 +1,371 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# (Makefile project only supported on Mac OS X and Linux Platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += -D_QNX_SOURCE
+        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on QNX
+ifeq ($(TARGET_OS),qnx)
+  $(info >>> WARNING - c++11_cuda is not supported on QNX - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+#Detect if installed version of GCC supports required C++11
+ifeq ($(TARGET_OS),linux)
+    empty :=
+    space := $(empty) $(empty)
+    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
+#Create version number without "."
+    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
+    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
+# Make sure the version number has at least 3 decimals
+    GCCVERSION += 00
+# Remove spaces from the version number
+    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
+#$(warning $(GCCVERSION))
+
+    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
+
+    ifeq ($(IS_MIN_VERSION), 1)
+        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
+    else
+        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
+        SAMPLE_ENABLED := 0
+    endif
+endif
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --std=c++11 --threads 0
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: c++11_cuda
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+c++11_cuda.o:c++11_cuda.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+c++11_cuda: c++11_cuda.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./c++11_cuda
+
+testrun: build
+
+clean:
+	rm -f c++11_cuda c++11_cuda.o
+	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/c++11_cuda
+
+clobber: clean
--- a/Samples/0_Introduction/c++11_cuda/NsightEclipse.xml
+++ b/Samples/0_Introduction/c++11_cuda/NsightEclipse.xml
@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>c++11_cuda</name>
+  <cflags>
+    <flag>--std=c++11</flag>
+  </cflags>
+  <cuda_api_list>
+    <toolkit>cudaMalloc</toolkit>
+    <toolkit>cudaMemcpy</toolkit>
+    <toolkit>cudaMemset</toolkit>
+    <toolkit>cudaFree</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the x, y, z, and w characters. ]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="advanced">CPP11 CUDA</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>GPGPU</keyword>
+    <keyword>CPP11</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>c++11_cuda.cu</primary_file>
+  <required_dependencies>
+    <dependency>CPP11</dependency>
+  </required_dependencies>
+  <scopes>
+    <scope>1:CUDA Advanced Topics</scope>
+    <scope>1:C++11 CUDA</scope>
+  </scopes>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm53</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <sm-arch>sm80</sm-arch>
+  <sm-arch>sm86</sm-arch>
+  <sm-arch>sm87</sm-arch>
+  <sm-arch>sm89</sm-arch>
+  <sm-arch>sm90</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>sbsa</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>C++11 CUDA</title>
+  <type>exe</type>
+</entry>
--- a/Samples/0_Introduction/c++11_cuda/README.md
+++ b/Samples/0_Introduction/c++11_cuda/README.md
@ -0,0 +1,74 @@
+# c++11_cuda - C++11 CUDA
+
+## Description
+
+This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the x, y, z, and w characters.
+
+## Key Concepts
+
+CPP11 CUDA
+
+## Supported SM Architectures
+
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.3 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.9 ](https://developer.nvidia.com/cuda-gpus)  [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaMalloc, cudaMemcpy, cudaMemset, cudaFree
+
+## Dependencies needed to build/run
+[CPP11](../../../README.md#cpp11)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+## References (for more details)
+
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda.cu
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda.cu
@ -0,0 +1,140 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <thrust/device_ptr.h>
+#include <thrust/count.h>
+#include <thrust/execution_policy.h>
+
+#include <iostream>
+#include <helper_cuda.h>
+
+/////////////////////////////////////////////////////////////////
+// Some utility code to define grid_stride_range
+// Normally this would be in a header but it's here
+// for didactic purposes. Uses the range helpers from range.hpp.
+#include "range.hpp"
+using namespace util::lang;
+
+// type alias to simplify typing...
+template <typename T>
+using step_range = typename range_proxy<T>::step_range_proxy;
+
+// Builds the strided index range the calling thread should walk: it starts
+// at `begin` plus the thread's global x-index and advances by the total
+// number of threads launched along x, stopping once `end` is reached.
+template <typename T>
+__device__ step_range<T> grid_stride_range(T begin, T end) {
+  const auto thread_offset = blockDim.x * blockIdx.x + threadIdx.x;
+  const auto total_threads = gridDim.x * blockDim.x;
+  begin += thread_offset;
+  return range(begin, end).step(total_threads);
+}
+/////////////////////////////////////////////////////////////////
+
+// Device helper: for every index this thread owns in [0, n), evaluates the
+// predicate on data[index] and atomically increments *count on a match.
+template <typename T, typename Predicate>
+__device__ void count_if(int *count, T *data, int n, Predicate p) {
+  const auto my_indices = grid_stride_range(0, n);
+  for (auto idx : my_indices) {
+    if (!p(data[idx])) continue;
+    atomicAdd(count, 1);
+  }
+}
+
+// Kernel: adds to *count the number of characters in text[0..n) that are one
+// of 'x', 'y', 'z' or 'w'. The membership test is a lambda handed to
+// count_if; it walks the letter table with a range-based for loop, and
+// `auto` spares us from naming the lambda's type.
+__global__ void xyzw_frequency(int *count, char *text, int n) {
+  const char letters[]{'x', 'y', 'z', 'w'};
+
+  // Captures `letters` by reference; true when c equals any table entry.
+  auto is_wanted = [&](char c) {
+    for (const auto candidate : letters) {
+      if (candidate == c) return true;
+    }
+    return false;
+  };
+
+  count_if(count, text, n, is_wanted);
+}
+
+// Kernel: same count as xyzw_frequency, but delegated to thrust::count_if
+// with the thrust::device policy. The result overwrites *count (it is
+// assigned, not accumulated).
+__global__ void xyzw_frequency_thrust_device(int *count, char *text, int n) {
+  const char letters[]{'x', 'y', 'z', 'w'};
+
+  // Captures `letters` by value; true when c equals any table entry.
+  auto is_wanted = [=](char c) {
+    for (const auto candidate : letters) {
+      if (candidate == c) return true;
+    }
+    return false;
+  };
+
+  char *const text_end = text + n;
+  *count = thrust::count_if(thrust::device, text, text_end, is_wanted);
+}
+
+// a bug in Thrust 1.8 causes warnings when this is uncommented
+// so commented out by default -- fixed in Thrust master branch
+#if 0 
+// Host-side variant: counts the characters in text[0..n) that equal 'x',
+// 'y', 'z' or 'w' using thrust::count_if with the thrust::host policy and
+// stores the result in *count. Compiled out by the surrounding #if 0.
+void xyzw_frequency_thrust_host(int *count, char *text, int n)
+{
+  const char letters[] { 'x','y','z','w' };
+  // The lambda captures `letters` by reference.
+  *count = thrust::count_if(thrust::host, text, text+n, [&](char c) {
+    for (const auto x : letters) 
+      if (c == x) return true;
+    return false;
+  });
+}
+#endif
+
+// Counts the letters 'x', 'y', 'z' and 'w' in "warandpeace.txt" on the GPU
+// and prints the result.
+//
+// Returns EXIT_FAILURE when the host buffer cannot be allocated or the corpus
+// cannot be located or opened, EXIT_SUCCESS otherwise. Every CUDA runtime
+// result is passed to checkCudaErrors.
+int main(int argc, char **argv) {
+  const char *filename = sdkFindFilePath("warandpeace.txt", argv[0]);
+
+  // Capacity of the host and device text buffers (16 MiB).
+  int numBytes = 16 * 1048576;
+  char *h_text = (char *)malloc(numBytes);
+  if (h_text == NULL) {
+    printf("Cannot allocate the host text buffer. Exiting..\n");
+    return EXIT_FAILURE;
+  }
+
+  // find first CUDA device
+  findCudaDevice(argc, (const char **)argv);
+
+  // Read the corpus before allocating device memory so the failure path has
+  // nothing on the GPU to release. The lookup result is tested first because
+  // passing a null path to fopen is undefined behaviour.
+  // NOTE(review): assumes sdkFindFilePath reports "not found" with a null
+  // pointer -- confirm against helper_string.h.
+  FILE *fp = (filename != NULL) ? fopen(filename, "r") : NULL;
+  if (fp == NULL) {
+    printf("Cannot find the input text file. Exiting..\n");
+    free(h_text);
+    return EXIT_FAILURE;
+  }
+  int len = (int)fread(h_text, sizeof(char), numBytes, fp);
+  fclose(fp);
+  std::cout << "Read " << len << " byte corpus from " << filename << std::endl;
+
+  char *d_text;
+  checkCudaErrors(cudaMalloc((void **)&d_text, numBytes));
+  checkCudaErrors(cudaMemcpy(d_text, h_text, len, cudaMemcpyHostToDevice));
+
+  int count = 0;
+  int *d_count;
+  checkCudaErrors(cudaMalloc(&d_count, sizeof(int)));
+  checkCudaErrors(cudaMemset(d_count, 0, sizeof(int)));
+
+  // Both kernels write to d_count: the first accumulates with atomicAdd, the
+  // second then overwrites the value with the Thrust result. Comment out one
+  // launch to observe the other on its own.
+  xyzw_frequency<<<8, 256>>>(d_count, d_text, len);
+  checkCudaErrors(cudaGetLastError());  // surface launch failures
+  xyzw_frequency_thrust_device<<<1, 1>>>(d_count, d_text, len);
+  checkCudaErrors(cudaGetLastError());  // surface launch failures
+  checkCudaErrors(
+      cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost));
+
+  // xyzw_frequency_thrust_host(&count, h_text, len);
+
+  std::cout << "counted " << count
+            << " instances of 'x', 'y', 'z', or 'w' in \"" << filename << "\""
+            << std::endl;
+
+  checkCudaErrors(cudaFree(d_count));
+  checkCudaErrors(cudaFree(d_text));
+  free(h_text);
+
+  return EXIT_SUCCESS;
+}
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2017.sln
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2017.vcxproj
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2017.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>c++11_cuda_vs2017</RootNamespace>
+    <ProjectName>c++11_cuda</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="c++11_cuda.cu" />
+    <ClInclude Include="range.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2019.sln
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2019.vcxproj
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>c++11_cuda_vs2019</RootNamespace>
+    <ProjectName>c++11_cuda</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="c++11_cuda.cu" />
+    <ClInclude Include="range.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2022.sln
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2022.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2022
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2022.vcxproj
+++ b/Samples/0_Introduction/c++11_cuda/c++11_cuda_vs2022.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>c++11_cuda_vs2022</RootNamespace>
+    <ProjectName>c++11_cuda</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="c++11_cuda.cu" />
+    <ClInclude Include="range.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/c++11_cuda/range.hpp
+++ b/Samples/0_Introduction/c++11_cuda/range.hpp
@ -0,0 +1,279 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef UTIL_LANG_RANGE_HPP
+#define UTIL_LANG_RANGE_HPP
+
+#include <cstddef>
+#include <initializer_list>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+// Make these ranges usable inside CUDA C++ device code
+#ifdef __CUDACC__
+#define DEVICE_CALLABLE __host__ __device__
+#else
+#define DEVICE_CALLABLE
+#endif
+
+namespace util {
+namespace lang {
+
+namespace detail {
+
+// Common base of the range iterators: stores the current value and provides
+// dereference, increment-by-one and exact-equality comparison.
+template <typename T>
+struct range_iter_base {
+  // Iterator member types, spelled out because std::iterator is deprecated
+  // since C++17. They are the ones
+  // std::iterator<std::input_iterator_tag, T> used to supply.
+  using iterator_category = std::input_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = T&;
+
+  DEVICE_CALLABLE
+  range_iter_base(T current) : current(current) {}
+
+  // Returns the current value by copy.
+  DEVICE_CALLABLE
+  T operator*() const { return current; }
+
+  DEVICE_CALLABLE
+  T const* operator->() const { return &current; }
+
+  // Pre-increment: advances the stored value by one.
+  DEVICE_CALLABLE
+  range_iter_base& operator++() {
+    ++current;
+    return *this;
+  }
+
+  // Post-increment: advances, returning the state from before the step.
+  DEVICE_CALLABLE
+  range_iter_base operator++(int) {
+    auto copy = *this;
+    ++*this;
+    return copy;
+  }
+
+  DEVICE_CALLABLE
+  bool operator==(range_iter_base const& other) const {
+    return current == other.current;
+  }
+
+  // Written with `!` (not the `not` alternative token) to match the derived
+  // iterators and to avoid relying on alternative-token support.
+  DEVICE_CALLABLE
+  bool operator!=(range_iter_base const& other) const {
+    return !(*this == other);
+  }
+
+ protected:
+  T current;
+};
+
+}  // namespace detail
+
+// Half-open range [begin, end) over values of type T, usable in a
+// range-based for loop. step() derives a strided view of the same range.
+template <typename T>
+struct range_proxy {
+  // Unit-stride iterator; all behaviour comes from range_iter_base.
+  struct iter : detail::range_iter_base<T> {
+    DEVICE_CALLABLE
+    iter(T current) : detail::range_iter_base<T>(current) {}
+  };
+
+  // Strided view over [begin, end): values begin, begin + step, ... that lie
+  // before `end` in the direction of travel.
+  struct step_range_proxy {
+    struct iter : detail::range_iter_base<T> {
+      DEVICE_CALLABLE
+      iter(T current, T step)
+          : detail::range_iter_base<T>(current), step(step) {}
+
+      using detail::range_iter_base<T>::current;
+
+      // Pre-increment: advances by `step` instead of one.
+      DEVICE_CALLABLE
+      iter& operator++() {
+        current += step;
+        return *this;
+      }
+
+      DEVICE_CALLABLE
+      iter operator++(int) {
+        auto copy = *this;
+        ++*this;
+        return copy;
+      }
+
+      // Loses commutativity. Iterator-based ranges are simply broken. :-(
+      // "Equal" means "this iterator has reached or passed `other`", so the
+      // loop still terminates when the stride jumps over the end value.
+      // For non-positive steps the test is `<=` (not `<`) so that an end
+      // value hit exactly is excluded, just as it is for positive steps.
+      DEVICE_CALLABLE
+      bool operator==(iter const& other) const {
+        return step > 0 ? current >= other.current : current <= other.current;
+      }
+
+      DEVICE_CALLABLE
+      bool operator!=(iter const& other) const { return !(*this == other); }
+
+     private:
+      T step;
+    };
+
+    DEVICE_CALLABLE
+    step_range_proxy(T begin, T end, T step)
+        : begin_(begin, step), end_(end, step) {}
+
+    DEVICE_CALLABLE
+    iter begin() const { return begin_; }
+
+    DEVICE_CALLABLE
+    iter end() const { return end_; }
+
+   private:
+    iter begin_;
+    iter end_;
+  };
+
+  DEVICE_CALLABLE
+  range_proxy(T begin, T end) : begin_(begin), end_(end) {}
+
+  // Returns a strided view with the same bounds. Const-qualified because it
+  // only reads the stored bounds.
+  DEVICE_CALLABLE
+  step_range_proxy step(T step) const { return {*begin_, *end_, step}; }
+
+  DEVICE_CALLABLE
+  iter begin() const { return begin_; }
+
+  DEVICE_CALLABLE
+  iter end() const { return end_; }
+
+ private:
+  iter begin_;
+  iter end_;
+};
+
+// Unbounded range starting at a given value: its iterators never compare
+// equal, so a range-based for loop over it only ends via break/return.
+template <typename T>
+struct infinite_range_proxy {
+  // Unit-stride iterator whose comparisons are constant.
+  struct iter : detail::range_iter_base<T> {
+    DEVICE_CALLABLE
+    iter(T start = T()) : detail::range_iter_base<T>(start) {}
+
+    // Never equal to anything ...
+    DEVICE_CALLABLE
+    bool operator==(iter const&) const { return false; }
+
+    // ... and therefore always different.
+    DEVICE_CALLABLE
+    bool operator!=(iter const&) const { return true; }
+  };
+
+  // Strided flavour of the unbounded range.
+  struct step_range_proxy {
+    struct iter : detail::range_iter_base<T> {
+      DEVICE_CALLABLE
+      iter(T start = T(), T stride = T())
+          : detail::range_iter_base<T>(start), step(stride) {}
+
+      using detail::range_iter_base<T>::current;
+
+      // Pre-increment: moves forward by the configured stride.
+      DEVICE_CALLABLE
+      iter& operator++() {
+        current += step;
+        return *this;
+      }
+
+      // Post-increment: hands back the state from before the move.
+      DEVICE_CALLABLE
+      iter operator++(int) {
+        iter previous(*this);
+        operator++();
+        return previous;
+      }
+
+      DEVICE_CALLABLE
+      bool operator==(iter const&) const { return false; }
+
+      DEVICE_CALLABLE
+      bool operator!=(iter const&) const { return true; }
+
+     private:
+      T step;
+    };
+
+    DEVICE_CALLABLE
+    step_range_proxy(T first, T stride) : begin_(first, stride) {}
+
+    DEVICE_CALLABLE
+    iter begin() const { return begin_; }
+
+    // A default-constructed iterator serves as the (unreachable) end.
+    DEVICE_CALLABLE
+    iter end() const { return iter(); }
+
+   private:
+    iter begin_;
+  };
+
+  DEVICE_CALLABLE
+  infinite_range_proxy(T first) : begin_(first) {}
+
+  // Returns a strided view that starts at the same value.
+  DEVICE_CALLABLE
+  step_range_proxy step(T step) { return {*begin_, step}; }
+
+  DEVICE_CALLABLE
+  iter begin() const { return begin_; }
+
+  // A default-constructed iterator serves as the (unreachable) end.
+  DEVICE_CALLABLE
+  iter end() const { return iter(); }
+
+ private:
+  iter begin_;
+};
+
+// Creates the bounded range [begin, end).
+template <typename T>
+DEVICE_CALLABLE range_proxy<T> range(T begin, T end) {
+  return range_proxy<T>(begin, end);
+}
+
+// Creates the unbounded range that starts at `begin`.
+template <typename T>
+DEVICE_CALLABLE infinite_range_proxy<T> range(T begin) {
+  return infinite_range_proxy<T>(begin);
+}
+
+namespace traits {
+
+// Compile-time test: `value` is true when a const C has a size() member
+// function whose result type is integral, false otherwise.
+template <typename C>
+struct has_size {
+  // Chosen when `U const` has a callable size(); the return type reports
+  // whether that result is integral.
+  template <typename U>
+  static constexpr typename std::is_integral<
+      decltype(std::declval<U const>().size())>::type
+  check(U*);
+
+  // Fallback for every other type.
+  template <typename>
+  static constexpr std::false_type check(...);
+
+  typedef decltype(check<C>(0)) type;
+  static constexpr bool value = type::value;
+};
+
+}  // namespace traits
+
+// Returns the index range [0, cont.size()) of a container.
+//
+// Participates in overload resolution only when traits::has_size<C> is true.
+// The `::type` on enable_if is what makes the constraint effective: without
+// it the default argument is just the enable_if class itself, which is a
+// valid type for any C.
+template <typename C,
+          typename = typename std::enable_if<traits::has_size<C>::value>::type>
+DEVICE_CALLABLE auto indices(C const& cont)
+    -> range_proxy<decltype(cont.size())> {
+  return {0, cont.size()};
+}
+
+// Returns the index range [0, N) of a built-in array with N elements.
+template <typename T, std::size_t N>
+DEVICE_CALLABLE range_proxy<std::size_t> indices(T(&)[N]) {
+  return range_proxy<std::size_t>(0, N);
+}
+
+// Returns the index range [0, cont.size()) of an initializer list.
+template <typename T>
+DEVICE_CALLABLE range_proxy<typename std::initializer_list<T>::size_type>
+indices(std::initializer_list<T>&& cont) {
+  typedef typename std::initializer_list<T>::size_type size_type;
+  return range_proxy<size_type>(0, cont.size());
+}
+}
+}  // namespace util::lang
+
+#endif  // ndef UTIL_LANG_RANGE_HPP
--- a/Samples/0_Introduction/c++11_cuda/warandpeace.txt
+++ b/Samples/0_Introduction/c++11_cuda/warandpeace.txt
--- a/Samples/0_Introduction/clock/.vscode/tasks.json
+++ b/Samples/0_Introduction/clock/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/0_Introduction/clock/CMakeLists.txt
+++ b/Samples/0_Introduction/clock/CMakeLists.txt
@ -1,34 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
-
-project(clock LANGUAGES C CXX CUDA)
-
-find_package(CUDAToolkit REQUIRED)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
-endif()
-
-# Include directories and libraries
-include_directories(../../../Common)
-
-# Source file
-# Add target for asyncAPI
-add_executable(clock clock.cu)
-
-target_compile_options(clock PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
-
-target_compile_features(clock PRIVATE cxx_std_17 cuda_std_17)
-
-set_target_properties(clock PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-# Include installation configuration
-include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
-setup_samples_install()
--- a/Samples/0_Introduction/clock/Makefile
+++ b/Samples/0_Introduction/clock/Makefile
@ -0,0 +1,340 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+# (?= keeps a CUDA_PATH that was already set in the environment or on the
+# make command line)
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+# Each legacy switch below prints a warning and forwards to its replacement
+# variable. The forwarding uses ?=, so a TARGET_ARCH / HOST_COMPILER that is
+# already defined is left alone, and if several legacy switches are set the
+# first one listed here takes effect.
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+# abi has no replacement: setting it to any non-empty value aborts the build.
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+# HOST_ARCH is whatever `uname -m` reports; TARGET_ARCH defaults to it.
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+# TARGET_SIZE (the word size, 64 or 32) is derived from the target name for a
+# cross build and read from `getconf LONG_BIT` for a native build. Any
+# TARGET_ARCH outside the list below aborts the build.
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+# The test is whether $(CUDA_PATH)/targets/sbsa-linux exists; if it does, both
+# HOST_ARCH and TARGET_ARCH are overwritten with sbsa.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+# Cross compilation is accepted only for the host-target pairs listed here.
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+# HOST_OS is the lower-cased output of `uname -s`; TARGET_OS defaults to it.
+# Any TARGET_OS outside linux, darwin, qnx and android aborts the build.
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+# Every assignment in this section uses ?=, so a HOST_COMPILER that is already
+# defined (by the user, or through the deprecated GCC variable) is never
+# replaced; the branches only choose a default, with g++ as the last resort.
+# On darwin, clang++ becomes the default when the major version printed by
+# `xcodebuild -version` is 5 or higher.
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            # QNX builds need both QNX_HOST and QNX_TARGET; they are exported
+            # so that commands started by make can see them.
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+# nvcc is always invoked with the selected host compiler passed via -ccbin.
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+# NVCCFLAGS are handed to nvcc as-is. CCFLAGS and LDFLAGS are collected bare
+# here and get their -Xcompiler / -Xlinker prefixes when ALL_CCFLAGS and
+# ALL_LDFLAGS are assembled further down.
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+# Cross-compilation extras. When TARGET_FS is set it is used as the sysroot
+# and as the prefix for the additional library and include search paths.
+# NOTE(review): expr compares non-integer operands as strings, so the
+# "<= 4.6" checks below are not a true version comparison (for example "11"
+# compares as <= "4.6") -- confirm the intent before relying on them.
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += -D_QNX_SOURCE
+        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+# When TARGET_OVERRIDE is defined, its value is passed to nvcc as -target-dir.
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+# Maps the TARGET_ARCH-TARGET_OS pair to a targets/<name>/ subdirectory; the
+# variable stays empty for any pair not listed (e.g. a native x86_64 build).
+# NOTE(review): CUDA_INSTALL_TARGET_DIR is not referenced anywhere else in
+# this Makefile -- confirm whether it is still needed.
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Build type: dbg=1 adds host and device debug information (-g -G) and selects
+# the "debug" output directory; every other value builds "release".
+ifneq ($(dbg),1)
+      BUILD_TYPE := release
+else
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+endif
+
+# The sample starts out enabled; later checks may clear this.
+SAMPLE_ENABLED := 1
+
+# Compiler command line: nvcc flags first, then the host-compiler flags, each
+# word of which is forwarded through -Xcompiler.
+ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+# Linker command line: a snapshot of the compiler flags as they stand at this
+# point (simple := assignment), followed by the linker flags, each word of
+# which is forwarded through -Xlinker.
+ALL_LDFLAGS := $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Include search path and extra libraries shared by the rules below
+LIBRARIES :=
+INCLUDES  := -I../../../Common
+
+################################################################################
+
+# Gencode arguments
+# The default SM list depends on the target family; because of ?=, a value
+# such as SMS="50 60" given on the command line or in the environment wins.
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+# An explicitly empty SMS waives the sample instead of failing the build.
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+# GENCODE_FLAGS supplied by the caller are used untouched; otherwise they are
+# derived from SMS.
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+# make's built-in $(sort) orders words as strings ("100" sorts before "90"),
+# so the list is ordered numerically with `sort -n` to find the real maximum.
+HIGHEST_SM := $(lastword $(shell printf '%s\n' $(SMS) | sort -n))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+# For a waived sample, EXEC turns every recipe command into an echo.
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+# None of the targets listed here names a real output file, so they are
+# declared phony: make then always runs them, and a stray file called e.g.
+# "clean" or "build" in this directory cannot make them look up to date.
+.PHONY: all build check.deps run testrun clean clobber
+
+all: build
+
+build: clock
+
+# Reports whether the sample is enabled or has been waived.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# $(EXEC) is not set by this Makefile for an enabled sample; for a waived one
+# it is an echo prefix, so the commands below are printed instead of executed.
+clock.o:clock.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Links the executable, then copies it into the shared bin tree under
+# ../../../bin/<arch>/<os>/<build type>.
+clock: clock.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./clock
+
+# testrun only builds the sample; it has no recipe of its own.
+testrun: build
+
+# Removes the local build products and the copy placed in the shared bin tree
+# for the current TARGET_ARCH / TARGET_OS / BUILD_TYPE.
+clean:
+	rm -f clock clock.o
+	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/clock
+
+clobber: clean
--- a/Samples/0_Introduction/clock/NsightEclipse.xml
+++ b/Samples/0_Introduction/clock/NsightEclipse.xml
@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>clock</name>
+  <cuda_api_list>
+    <toolkit>cudaMalloc</toolkit>
+    <toolkit>cudaMemcpy</toolkit>
+    <toolkit>cudaFree</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This example shows how to use the clock function to measure the performance of a block of threads of a kernel accurately.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">Performance Strategies</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>performance</keyword>
+    <keyword>timing</keyword>
+    <keyword>CUDA</keyword>
+    <keyword>clock</keyword>
+    <keyword>timer</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>clock.cu</primary_file>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>1:Performance Strategies</scope>
+  </scopes>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm53</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <sm-arch>sm80</sm-arch>
+  <sm-arch>sm86</sm-arch>
+  <sm-arch>sm87</sm-arch>
+  <sm-arch>sm89</sm-arch>
+  <sm-arch>sm90</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>sbsa</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>Clock</title>
+  <type>exe</type>
+</entry>
--- a/Samples/0_Introduction/clock/README.md
+++ b/Samples/0_Introduction/clock/README.md
@ -18,7 +18,7 @@ Linux, Windows

 ## Supported CPU Architecture

-x86_64, armv7l
+x86_64, ppc64le, armv7l

 ## CUDA APIs involved

@ -27,6 +27,44 @@ cudaMalloc, cudaMemcpy, cudaFree

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```

 ## References (for more details)
+
--- a/Samples/0_Introduction/clock/clock.cu
+++ b/Samples/0_Introduction/clock/clock.cu
@ -48,46 +48,43 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
-{
-    // __shared__ float shared[2 * blockDim.x];
-    extern __shared__ float shared[];
+__global__ static void timedReduction(const float *input, float *output,
+                                      clock_t *timer) {
+  // __shared__ float shared[2 * blockDim.x];
+  extern __shared__ float shared[];

-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;

-    if (tid == 0)
-        timer[bid] = clock();
+  if (tid == 0) timer[bid] = clock();

-    // Copy input.
-    shared[tid]              = input[tid];
-    shared[tid + blockDim.x] = input[tid + blockDim.x];
-
-    // Perform reduction to find minimum.
-    for (int d = blockDim.x; d > 0; d /= 2) {
-        __syncthreads();
-
-        if (tid < d) {
-            float f0 = shared[tid];
-            float f1 = shared[tid + d];
-
-            if (f1 < f0) {
-                shared[tid] = f1;
-            }
-        }
-    }
-
-    // Write result.
-    if (tid == 0)
-        output[bid] = shared[0];
+  // Copy input.
+  shared[tid] = input[tid];
+  shared[tid + blockDim.x] = input[tid + blockDim.x];

+  // Perform reduction to find minimum.
+  for (int d = blockDim.x; d > 0; d /= 2) {
    __syncthreads();

-    if (tid == 0)
-        timer[bid + gridDim.x] = clock();
+    if (tid < d) {
+      float f0 = shared[tid];
+      float f1 = shared[tid + d];
+
+      if (f1 < f0) {
+        shared[tid] = f1;
+      }
+    }
+  }
+
+  // Write result.
+  if (tid == 0) output[bid] = shared[0];
+
+  __syncthreads();
+
+  if (tid == 0) timer[bid + gridDim.x] = clock();
 }

-#define NUM_BLOCKS  64
+#define NUM_BLOCKS 64
 #define NUM_THREADS 256

 // It's interesting to change the number of blocks and the number of threads to
@ -107,46 +104,50 @@ __global__ static void timedReduction(const float *input, float *output, clock_t
 // the memory. With more than 32 the speed scales linearly.

 // Start the main CUDA Sample here
-int main(int argc, char **argv)
-{
-    printf("CUDA Clock sample\n");
+int main(int argc, char **argv) {
+  printf("CUDA Clock sample\n");

-    // This will pick the best possible CUDA capable device
-    int dev = findCudaDevice(argc, (const char **)argv);
+  // This will pick the best possible CUDA capable device
+  int dev = findCudaDevice(argc, (const char **)argv);

-    float   *dinput  = NULL;
-    float   *doutput = NULL;
-    clock_t *dtimer  = NULL;
+  float *dinput = NULL;
+  float *doutput = NULL;
+  clock_t *dtimer = NULL;

-    clock_t timer[NUM_BLOCKS * 2];
-    float   input[NUM_THREADS * 2];
+  clock_t timer[NUM_BLOCKS * 2];
+  float input[NUM_THREADS * 2];

-    for (int i = 0; i < NUM_THREADS * 2; i++) {
-        input[i] = (float)i;
-    }
+  for (int i = 0; i < NUM_THREADS * 2; i++) {
+    input[i] = (float)i;
+  }

-    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
-    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+  checkCudaErrors(
+      cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+  checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
+  checkCudaErrors(
+      cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
+                             cudaMemcpyHostToDevice));

-    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
+  timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
+      dinput, doutput, dtimer);

-    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
+                             cudaMemcpyDeviceToHost));

-    checkCudaErrors(cudaFree(dinput));
-    checkCudaErrors(cudaFree(doutput));
-    checkCudaErrors(cudaFree(dtimer));
+  checkCudaErrors(cudaFree(dinput));
+  checkCudaErrors(cudaFree(doutput));
+  checkCudaErrors(cudaFree(dtimer));

-    long double avgElapsedClocks = 0;
+  long double avgElapsedClocks = 0;

-    for (int i = 0; i < NUM_BLOCKS; i++) {
-        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
-    }
+  for (int i = 0; i < NUM_BLOCKS; i++) {
+    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
+  }

-    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
-    printf("Average clocks/block = %Lf\n", avgElapsedClocks);
+  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
+  printf("Average clocks/block = %Lf\n", avgElapsedClocks);

-    return EXIT_SUCCESS;
+  return EXIT_SUCCESS;
 }
--- a/Samples/0_Introduction/clock/clock_vs2017.sln
+++ b/Samples/0_Introduction/clock/clock_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/clock/clock_vs2017.vcxproj
+++ b/Samples/0_Introduction/clock/clock_vs2017.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>clock_vs2017</RootNamespace>
+    <ProjectName>clock</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/clock.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="clock.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/clock/clock_vs2019.sln
+++ b/Samples/0_Introduction/clock/clock_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/clock/clock_vs2019.vcxproj
+++ b/Samples/0_Introduction/clock/clock_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>clock_vs2019</RootNamespace>
+    <ProjectName>clock</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/clock.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="clock.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/clock/clock_vs2022.sln
+++ b/Samples/0_Introduction/clock/clock_vs2022.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2022
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/clock/clock_vs2022.vcxproj
+++ b/Samples/0_Introduction/clock/clock_vs2022.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>clock_vs2022</RootNamespace>
+    <ProjectName>clock</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/clock.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="clock.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/clock_nvrtc/.vscode/tasks.json
+++ b/Samples/0_Introduction/clock_nvrtc/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/0_Introduction/clock_nvrtc/CMakeLists.txt
+++ b/Samples/0_Introduction/clock_nvrtc/CMakeLists.txt
@ -1,43 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
-
-project(clock_nvrtc LANGUAGES C CXX CUDA)
-
-find_package(CUDAToolkit REQUIRED)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
-endif()
-
-# Include directories and libraries
-include_directories(../../../Common)
-
-# Source file
-# Add sample target executable
-add_executable(clock_nvrtc clock.cpp)
-
-target_compile_options(clock_nvrtc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
-
-target_compile_features(clock_nvrtc PRIVATE cxx_std_17 cuda_std_17)
-
-target_link_libraries(clock_nvrtc PRIVATE
-    CUDA::nvrtc
-    CUDA::cuda_driver
-)
-
-# Copy clock_kernel.cu to the output directory
-add_custom_command(TARGET clock_nvrtc POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different
-    ${CMAKE_CURRENT_SOURCE_DIR}/clock_kernel.cu ${CMAKE_CURRENT_BINARY_DIR}
-)
-
-# Include installation configuration
-include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
-setup_samples_install()
--- a/Samples/0_Introduction/clock_nvrtc/Makefile
+++ b/Samples/0_Introduction/clock_nvrtc/Makefile
@ -0,0 +1,392 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project is only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# On a native aarch64 system with a 32-bit userspace, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += -D_QNX_SOURCE
+        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# CUDA install sub-directory for the selected target architecture and OS
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
+
+SAMPLE_ENABLED := 1
+
+# This sample is not supported on ARMv7
+ifeq ($(TARGET_ARCH),armv7l)
+  $(info >>> WARNING - clock_nvrtc is not supported on ARMv7 - waiving sample <<<)
+  SAMPLE_ENABLED := 0
+endif
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+# libNVRTC specific libraries
+ifeq ($(TARGET_OS),darwin)
+ LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA
+endif
+
+ifeq ($(TARGET_OS),darwin)
+  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
+else
+  ifeq ($(TARGET_ARCH),x86_64)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
+  endif
+
+  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
+    ifdef TARGET_OVERRIDE
+        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
+    endif
+  endif
+
+  ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
+  endif
+
+  ifeq ($(HOST_ARCH),ppc64le)
+    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
+  endif
+
+  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
+  ifeq ("$(CUDALIB)","")
+    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
+    SAMPLE_ENABLED := 0
+  else
+    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
+    LIBRARIES += -L$(CUDALIB) -lcuda
+  endif
+endif
+
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+INCLUDES += -I$(CUDA_PATH)/include
+
+LIBRARIES += -lnvrtc
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules. These names are commands, not files: declare them phony so a stray file called e.g. "build" or "clean" cannot mask them.
+.PHONY: all build check.deps run testrun clean clobber
+all: build
+build: clock_nvrtc
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+clock.o:clock.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+clock_nvrtc: clock.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./clock_nvrtc
+
+testrun: build
+
+clean:
+	rm -f clock_nvrtc clock.o
+	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/clock_nvrtc
+
+clobber: clean
--- a/Samples/0_Introduction/clock_nvrtc/README.md
+++ b/Samples/0_Introduction/clock_nvrtc/README.md
@ -18,7 +18,7 @@ Linux, Windows, QNX

 ## Supported CPU Architecture

-x86_64, aarch64
+x86_64, ppc64le, aarch64

 ## CUDA APIs involved

@ -33,7 +33,45 @@ cudaBlockSize, cudaGridSize

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=&lt;arch&gt;** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=g++
+    ```
+
 ## References (for more details)
+
--- a/Samples/0_Introduction/clock_nvrtc/clock.cpp
+++ b/Samples/0_Introduction/clock_nvrtc/clock.cpp
@ -34,11 +34,12 @@
 */

 // System includes
+#include <stdio.h>
+#include <stdint.h>
 #include <assert.h>
+
 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
-#include <stdint.h>
-#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@ -70,68 +71,64 @@

 // Start the main CUDA Sample here

-int main(int argc, char **argv)
-{
-    printf("CUDA Clock sample\n");
+int main(int argc, char **argv) {
+  printf("CUDA Clock sample\n");

-    typedef long clock_t;
+  typedef long clock_t;

-    clock_t timer[NUM_BLOCKS * 2];
+  clock_t timer[NUM_BLOCKS * 2];

-    float input[NUM_THREADS * 2];
+  float input[NUM_THREADS * 2];

-    for (int i = 0; i < NUM_THREADS * 2; i++) {
-        input[i] = (float)i;
-    }
+  for (int i = 0; i < NUM_THREADS * 2; i++) {
+    input[i] = (float)i;
+  }

-    char  *cubin, *kernel_file;
-    size_t cubinSize;
+  char *cubin, *kernel_file;
+  size_t cubinSize;

-    kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
-    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
+  kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
+  compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

-    CUmodule   module = loadCUBIN(cubin, argc, argv);
-    CUfunction kernel_addr;
+  CUmodule module = loadCUBIN(cubin, argc, argv);
+  CUfunction kernel_addr;

-    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
+  checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));

-    dim3 cudaBlockSize(NUM_THREADS, 1, 1);
-    dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
+  dim3 cudaBlockSize(NUM_THREADS, 1, 1);
+  dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

-    CUdeviceptr dinput, doutput, dtimer;
-    checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
-    checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
-    checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
+  CUdeviceptr dinput, doutput, dtimer;
+  checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
+  checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
+  checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+  checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));

-    void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
+  void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-    checkCudaErrors(cuLaunchKernel(kernel_addr,
-                                   cudaGridSize.x,
-                                   cudaGridSize.y,
-                                   cudaGridSize.z, /* grid dim */
-                                   cudaBlockSize.x,
-                                   cudaBlockSize.y,
-                                   cudaBlockSize.z, /* block dim */
-                                   sizeof(float) * 2 * NUM_THREADS,
-                                   0,       /* shared mem, stream */
-                                   &arr[0], /* arguments */
-                                   0));
+  checkCudaErrors(cuLaunchKernel(
+      kernel_addr, cudaGridSize.x, cudaGridSize.y,
+      cudaGridSize.z,                                    /* grid dim */
+      cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
+      sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
+      &arr[0],                            /* arguments */
+      0));

-    checkCudaErrors(cuCtxSynchronize());
-    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
-    checkCudaErrors(cuMemFree(dinput));
-    checkCudaErrors(cuMemFree(doutput));
-    checkCudaErrors(cuMemFree(dtimer));
+  checkCudaErrors(cuCtxSynchronize());
+  checkCudaErrors(
+      cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+  checkCudaErrors(cuMemFree(dinput));
+  checkCudaErrors(cuMemFree(doutput));
+  checkCudaErrors(cuMemFree(dtimer));

-    long double avgElapsedClocks = 0;
+  long double avgElapsedClocks = 0;

-    for (int i = 0; i < NUM_BLOCKS; i++) {
-        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
-    }
+  for (int i = 0; i < NUM_BLOCKS; i++) {
+    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
+  }

-    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
-    printf("Average clocks/block = %Lf\n", avgElapsedClocks);
+  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
+  printf("Average clocks/block = %Lf\n", avgElapsedClocks);

-    return EXIT_SUCCESS;
+  return EXIT_SUCCESS;
 }
--- a/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu
+++ b/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu
@ -37,41 +37,38 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
-{
-    // __shared__ float shared[2 * blockDim.x];
-    extern __shared__ float shared[];
+extern "C" __global__ void timedReduction(const float *input, float *output,
+                                          clock_t *timer) {
+  // __shared__ float shared[2 * blockDim.x];
+  extern __shared__ float shared[];

-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;

-    if (tid == 0)
-        timer[bid] = clock();
+  if (tid == 0) timer[bid] = clock();

-    // Copy input.
-    shared[tid]              = input[tid];
-    shared[tid + blockDim.x] = input[tid + blockDim.x];
-
-    // Perform reduction to find minimum.
-    for (int d = blockDim.x; d > 0; d /= 2) {
-        __syncthreads();
-
-        if (tid < d) {
-            float f0 = shared[tid];
-            float f1 = shared[tid + d];
-
-            if (f1 < f0) {
-                shared[tid] = f1;
-            }
-        }
-    }
-
-    // Write result.
-    if (tid == 0)
-        output[bid] = shared[0];
+  // Copy input.
+  shared[tid] = input[tid];
+  shared[tid + blockDim.x] = input[tid + blockDim.x];

+  // Perform reduction to find minimum.
+  for (int d = blockDim.x; d > 0; d /= 2) {
    __syncthreads();

-    if (tid == 0)
-        timer[bid + gridDim.x] = clock();
+    if (tid < d) {
+      float f0 = shared[tid];
+      float f1 = shared[tid + d];
+
+      if (f1 < f0) {
+        shared[tid] = f1;
+      }
+    }
+  }
+
+  // Write result.
+  if (tid == 0) output[bid] = shared[0];
+
+  __syncthreads();
+
+  if (tid == 0) timer[bid + gridDim.x] = clock();
 }
--- a/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2017.sln
+++ b/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2017.vcxproj
+++ b/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2017.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>clock_nvrtc_vs2017</RootNamespace>
+    <ProjectName>clock_nvrtc</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="clock.cpp" />
+    <None Include="clock_kernel.cu" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2019.sln
+++ b/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2019.vcxproj
+++ b/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>clock_nvrtc_vs2019</RootNamespace>
+    <ProjectName>clock_nvrtc</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="clock.cpp" />
+    <None Include="clock_kernel.cu" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2022.sln
+++ b/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2022.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2022
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2022.vcxproj
+++ b/Samples/0_Introduction/clock_nvrtc/clock_nvrtc_vs2022.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>clock_nvrtc_vs2022</RootNamespace>
+    <ProjectName>clock_nvrtc</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration></CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="clock.cpp" />
+    <None Include="clock_kernel.cu" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/6_Performance/cudaGraphsPerfScaling/.vscode/c_cpp_properties.json
+++ b/Samples/6_Performance/cudaGraphsPerfScaling/.vscode/c_cpp_properties.json
--- a/Samples/6_Performance/cudaGraphsPerfScaling/.vscode/extensions.json
+++ b/Samples/6_Performance/cudaGraphsPerfScaling/.vscode/extensions.json
--- a/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/launch.json
+++ b/Samples/8_Platform_Specific/Tegra/cudaNvSciBufMultiplanar/.vscode/launch.json
@ -4,7 +4,7 @@
            "name": "CUDA C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
-            "program": "${workspaceFolder}/cudaNvSciBufMultiplanar"
+            "program": "${workspaceFolder}/concurrentKernels"
        }
    ]
 }
--- a/Samples/0_Introduction/concurrentKernels/.vscode/tasks.json
+++ b/Samples/0_Introduction/concurrentKernels/.vscode/tasks.json
@ -0,0 +1,15 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "sample",
+            "type": "shell",
+            "command": "make dbg=1",
+            "problemMatcher": ["$nvcc"],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        }
+    ]
+}
--- a/Samples/0_Introduction/concurrentKernels/Makefile
+++ b/Samples/0_Introduction/concurrentKernels/Makefile
@ -0,0 +1,340 @@
+################################################################################
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project (only supported on Mac OS X and Linux platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+
+# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
+ifeq ($(HOST_ARCH),aarch64)
+    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
+        HOST_ARCH := sbsa
+        TARGET_ARCH := sbsa
+    endif
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),sbsa)
+        HOST_COMPILER ?= aarch64-linux-gnu-g++
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+        NVCCFLAGS += -D_QNX_SOURCE
+        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
+        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
+        LDFLAGS += -lsocket
+        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
+        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
+        ifdef TARGET_OVERRIDE
+            LDFLAGS += -lslog2
+        endif
+
+        ifneq ($(TARGET_FS),)
+            LDFLAGS += -L$(TARGET_FS)/usr/lib
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
+            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
+            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
+            CCFLAGS += -I$(TARGET_FS)/../include
+        endif
+    endif
+endif
+
+ifdef TARGET_OVERRIDE # cuda toolkit targets override
+    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
+SMS ?= 53 61 70 72 75 80 86 87 90
+else
+SMS ?= 50 52 60 61 70 75 80 86 89 90
+endif
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ALL_CCFLAGS += --threads 0 --std=c++11
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+
+# These targets are commands, not files.  Declaring them phony keeps a stray
+# file or directory with the same name (e.g. "build" or "clean") from making
+# the rule look up to date and being skipped.
+.PHONY: all build check.deps run testrun clean clobber
+
+# Default target: build the sample.
+all: build
+
+build: concurrentKernels
+
+# Report whether the sample can be built with the current configuration.
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+# Compile the single CUDA translation unit.
+concurrentKernels.o:concurrentKernels.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+# Link the executable and copy it into the shared bin/ tree.
+concurrentKernels: concurrentKernels.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./concurrentKernels
+
+testrun: build
+
+# Remove the local build products and the copy placed in the shared bin/ tree.
+clean:
+	rm -f concurrentKernels concurrentKernels.o
+	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/concurrentKernels
+
+clobber: clean
--- a/Samples/0_Introduction/concurrentKernels/NsightEclipse.xml
+++ b/Samples/0_Introduction/concurrentKernels/NsightEclipse.xml
@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>concurrentKernels</name>
+  <cuda_api_list>
+    <toolkit>cudaStreamDestroy</toolkit>
+    <toolkit>cudaMalloc</toolkit>
+    <toolkit>cudaMemcpyAsync</toolkit>
+    <toolkit>cudaFree</toolkit>
+    <toolkit>cudaMallocHost</toolkit>
+    <toolkit>cudaEventCreateWithFlags</toolkit>
+    <toolkit>cudaEventSynchronize</toolkit>
+    <toolkit>cudaEventRecord</toolkit>
+    <toolkit>cudaFreeHost</toolkit>
+    <toolkit>cudaGetDevice</toolkit>
+    <toolkit>cudaStreamWaitEvent</toolkit>
+    <toolkit>cudaEventDestroy</toolkit>
+    <toolkit>cudaEventElapsedTime</toolkit>
+    <toolkit>cudaStreamCreate</toolkit>
+    <toolkit>cudaGetDeviceProperties</toolkit>
+    <toolkit>cudaEventCreate</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../../Common</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="advanced">Performance Strategies</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>CUDA</keyword>
+    <keyword>Concurrent Kernels</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>concurrentKernels.cu</primary_file>
+  <scopes>
+    <scope>1:CUDA Advanced Topics</scope>
+    <scope>1:Performance Strategies</scope>
+  </scopes>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm53</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm72</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <sm-arch>sm80</sm-arch>
+  <sm-arch>sm86</sm-arch>
+  <sm-arch>sm87</sm-arch>
+  <sm-arch>sm89</sm-arch>
+  <sm-arch>sm90</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>sbsa</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>Concurrent Kernels</title>
+</entry>
--- a/Samples/0_Introduction/concurrentKernels/README.md
+++ b/Samples/0_Introduction/concurrentKernels/README.md
@ -0,0 +1,70 @@
+# concurrentKernels - Concurrent Kernels
+
+## Description
+
+This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function.
+
+## Key Concepts
+
+Performance Strategies
+
+## Supported SM Architectures
+
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.3 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.9 ](https://developer.nvidia.com/cuda-gpus)  [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaStreamDestroy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaEventCreateWithFlags, cudaEventSynchronize, cudaEventRecord, cudaFreeHost, cudaGetDevice, cudaStreamWaitEvent, cudaEventDestroy, cudaEventElapsedTime, cudaStreamCreate, cudaGetDeviceProperties, cudaEventCreate
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 12.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=g++
+    ```
+
+## References (for more details)
+
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels.cu
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels.cu
@ -0,0 +1,228 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//
+// This sample demonstrates the use of streams for concurrent execution. It also
+// illustrates how to introduce dependencies between CUDA streams with the
+// cudaStreamWaitEvent function.
+//
+
+// Devices of compute capability 2.0 or higher can overlap the kernels
+//
+#include <cooperative_groups.h>
+#include <stdio.h>
+
+namespace cg = cooperative_groups;
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+// Busy-wait kernel: performs no useful work, it just spins until at least
+// `clock_count` device clock ticks have passed since it started, then stores
+// the number of ticks it actually observed in d_o[0].
+__global__ void clock_block(clock_t *d_o, clock_t clock_count) {
+  // Only the low 32 bits of the counter are kept.  Unsigned subtraction is
+  // performed modulo 2^32, so (now - t0) is still the elapsed tick count when
+  // the counter wraps between the two samples:
+  //   now - t0 == now + 2^32 - t0   (mod 2^32)
+  // which is why no explicit "now > t0" case split is needed.
+  const unsigned int t0 = (unsigned int)clock();
+
+  clock_t elapsed = 0;
+
+  for (; elapsed < clock_count;
+       elapsed = (clock_t)((unsigned int)clock() - t0)) {
+    // spin: the loop header re-samples the clock on every iteration
+  }
+
+  d_o[0] = elapsed;
+}
+
+// Single-warp reduction kernel: adds up the N values in d_clocks and leaves
+// the total in d_clocks[0].  The shared scratch array has 32 entries and the
+// reduction starts at a stride of 16, so the kernel expects to be launched as
+// one block of 32 threads.
+__global__ void sum(clock_t *d_clocks, int N) {
+  // Handle to thread block group
+  cg::thread_block block = cg::this_thread_block();
+  __shared__ clock_t partial[32];
+
+  const unsigned int lane = threadIdx.x;
+
+  // Phase 1: every thread accumulates its own strided slice of the input.
+  clock_t acc = 0;
+  for (int idx = lane; idx < N; idx += blockDim.x) {
+    acc += d_clocks[idx];
+  }
+
+  partial[lane] = acc;
+  cg::sync(block);
+
+  // Phase 2: pairwise reduction in shared memory, halving the number of
+  // active threads on each step (16, 8, 4, 2, 1).
+  for (int stride = 16; stride > 0; stride >>= 1) {
+    if (lane < stride) {
+      partial[lane] += partial[lane + stride];
+    }
+
+    cg::sync(block);
+  }
+
+  d_clocks[0] = partial[0];
+}
+
+// Launches `nkernels` spin kernels, each in its own stream, plus a reduction
+// kernel in one extra stream that waits on all of them through events.  It
+// then prints the expected and measured times and checks the summed clock
+// counts.
+//
+// The "nkernels" command-line flag overrides the default of 8 kernels; device
+// selection is delegated to findCudaDevice().
+// Exits with EXIT_SUCCESS when the summed clocks exceed the requested total,
+// and with EXIT_FAILURE otherwise (or when the arguments/allocations fail).
+int main(int argc, char **argv) {
+  int nkernels = 8;        // number of concurrent kernels
+  float kernel_time = 10;  // time the kernel should run in ms
+  float elapsed_time;      // timing variables
+  int cuda_device = 0;
+
+  printf("[%s] - Starting...\n", argv[0]);
+
+  // get number of kernels if overridden on the command line
+  if (checkCmdLineFlag(argc, (const char **)argv, "nkernels")) {
+    nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
+  }
+
+  // every buffer below is indexed by kernel number and the reduction always
+  // writes element 0, so a non-positive count cannot be run
+  if (nkernels < 1) {
+    printf("Invalid number of kernels (%d): must be at least 1\n", nkernels);
+    exit(EXIT_FAILURE);
+  }
+
+  // Sizes are derived only after the command line has been parsed, so the
+  // buffers always match the number of kernels that are actually launched
+  // (sizing them from the default count would overflow d_a for nkernels > 8).
+  const int nstreams = nkernels + 1;  // one more stream than concurrent kernels
+  const size_t nbytes = (size_t)nkernels * sizeof(clock_t);  // data bytes
+
+  // use command-line specified CUDA device, otherwise use device with highest
+  // Gflops/s
+  cuda_device = findCudaDevice(argc, (const char **)argv);
+
+  cudaDeviceProp deviceProp;
+  checkCudaErrors(cudaGetDevice(&cuda_device));
+
+  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+
+  if (deviceProp.concurrentKernels == 0) {
+    printf("> GPU does not support concurrent kernel execution\n");
+    printf("  CUDA kernel runs will be serialized\n");
+  }
+
+  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
+         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+
+  // allocate host memory
+  clock_t *a = 0;  // pointer to the array data in host memory
+  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+
+  // allocate device memory
+  clock_t *d_a = 0;  // pointers to data and init value in the device memory
+  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+
+  // allocate and initialize an array of stream handles
+  cudaStream_t *streams =
+      (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
+
+  if (streams == NULL) {
+    printf("Failed to allocate host memory for %d stream handles\n", nstreams);
+    exit(EXIT_FAILURE);
+  }
+
+  for (int i = 0; i < nstreams; i++) {
+    checkCudaErrors(cudaStreamCreate(&(streams[i])));
+  }
+
+  // create CUDA event handles
+  cudaEvent_t start_event, stop_event;
+  checkCudaErrors(cudaEventCreate(&start_event));
+  checkCudaErrors(cudaEventCreate(&stop_event));
+
+  // The per-kernel events are used for synchronization only, so they do not
+  // need to record timings.  Disabling timing also keeps them from
+  // introducing global sync points when recorded, which is critical to get
+  // overlap.
+  cudaEvent_t *kernelEvent =
+      (cudaEvent_t *)malloc(nkernels * sizeof(cudaEvent_t));
+
+  if (kernelEvent == NULL) {
+    printf("Failed to allocate host memory for %d event handles\n", nkernels);
+    exit(EXIT_FAILURE);
+  }
+
+  for (int i = 0; i < nkernels; i++) {
+    checkCudaErrors(
+        cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  // time execution with nkernels streams
+  clock_t total_clocks = 0;
+#if defined(__arm__) || defined(__aarch64__)
+  // the kernel takes more time than the channel reset time on arm archs, so to
+  // prevent hangs reduce time_clocks.
+  clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
+#else
+  clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
+#endif
+
+  checkCudaErrors(cudaEventRecord(start_event, 0));
+
+  // queue nkernels in separate streams and record when they are done
+  for (int i = 0; i < nkernels; ++i) {
+    clock_block<<<1, 1, 0, streams[i]>>>(&d_a[i], time_clocks);
+    total_clocks += time_clocks;
+    checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
+
+    // make the last stream wait for the kernel event to be recorded
+    checkCudaErrors(
+        cudaStreamWaitEvent(streams[nstreams - 1], kernelEvent[i], 0));
+  }
+
+  // queue a sum kernel and a copy back to host in the last stream.
+  // the commands in this stream get dispatched as soon as all the kernel events
+  // have been recorded
+  sum<<<1, 32, 0, streams[nstreams - 1]>>>(d_a, nkernels);
+  checkCudaErrors(cudaMemcpyAsync(
+      a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams - 1]));
+
+  // at this point the CPU has dispatched all work for the GPU and can continue
+  // processing other tasks in parallel
+
+  // in this sample we just wait until the GPU is done
+  checkCudaErrors(cudaEventRecord(stop_event, 0));
+  checkCudaErrors(cudaEventSynchronize(stop_event));
+  checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
+
+  printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels,
+         nkernels * kernel_time / 1000.0f);
+  printf("Expected time for concurrent execution of %d kernels = %.3fs\n",
+         nkernels, kernel_time / 1000.0f);
+  printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);
+
+  // each kernel spins until it has seen at least time_clocks ticks, so the
+  // reduced total is expected to exceed the requested total
+  bool bTestResult = (a[0] > total_clocks);
+
+  // release resources: nstreams streams were created (one more than the
+  // number of kernels), so all of them are destroyed here
+  for (int i = 0; i < nstreams; i++) {
+    cudaStreamDestroy(streams[i]);
+  }
+
+  for (int i = 0; i < nkernels; i++) {
+    cudaEventDestroy(kernelEvent[i]);
+  }
+
+  free(streams);
+  free(kernelEvent);
+
+  cudaEventDestroy(start_event);
+  cudaEventDestroy(stop_event);
+  cudaFreeHost(a);
+  cudaFree(d_a);
+
+  if (!bTestResult) {
+    printf("Test failed!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  printf("Test passed\n");
+  exit(EXIT_SUCCESS);
+}
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2017.sln
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2017.vcxproj
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2017.vcxproj
@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>concurrentKernels_vs2017</RootNamespace>
+    <ProjectName>concurrentKernels</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
+    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
+    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="concurrentKernels.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2019.sln
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2019.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2019
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2019.vcxproj
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2019.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>concurrentKernels_vs2019</RootNamespace>
+    <ProjectName>concurrentKernels</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="concurrentKernels.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2022.sln
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2022.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2022
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2022.vcxproj
+++ b/Samples/0_Introduction/concurrentKernels/concurrentKernels_vs2022.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>concurrentKernels_vs2022</RootNamespace>
+    <ProjectName>concurrentKernels</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v143</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;compute_89,sm_89;compute_90,sm_90;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819"  --threads 0 </AdditionalOptions>
+      <Include>./;../../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="concurrentKernels.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 12.1.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/.vscode/c_cpp_properties.json
+++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/.vscode/c_cpp_properties.json
--- a/Show More
+++ b/Show More