From b7c5481c556c3fe98db060207ecaa41a4b9a9abc Mon Sep 17 00:00:00 2001
From: Dheemanth <dheemanthmanur72@gmail.com>
Date: Wed, 27 May 2026 14:50:59 -0700
Subject: [PATCH] Release v13.3 of the CUDA samples with CUDA 13.3 Toolkit
 (#435)

This is the release of the CUDA 13.3 samples, which include additions for CUDA Tile C++, and updated CCCL and Python samples.
---
 .gitignore                                    |    5 +
 CHANGELOG.md                                  |   13 +
 Common/helper_string.h                        |    8 +-
 README.md                                     |   11 +-
 cmake/CPM.cmake                               | 1297 +++++++++++++++++
 cpp/4_CUDA_Libraries/CMakeLists.txt           |    5 +
 .../cubDeviceFind/CMakeLists.txt              |   62 +
 cpp/4_CUDA_Libraries/cubDeviceFind/README.md  |   48 +
 .../cubDeviceFind/cubDeviceFind.cu            |  248 ++++
 .../cubDeviceSegmentedScan/CMakeLists.txt     |   63 +
 .../cubDeviceSegmentedScan/README.md          |   48 +
 .../cubDeviceSegmentedScan.cu                 |  188 +++
 .../cubDeviceTransform/CMakeLists.txt         |   62 +
 .../cubDeviceTransform/README.md              |   48 +
 .../cubDeviceTransform/cubDeviceTransform.cu  |  155 ++
 .../libcuxxMdspan/CMakeLists.txt              |   92 ++
 cpp/4_CUDA_Libraries/libcuxxMdspan/README.md  |   44 +
 .../libcuxxMdspan/libcuxxMdspan.cu            |  246 ++++
 .../libcuxxRandom/CMakeLists.txt              |   65 +
 cpp/4_CUDA_Libraries/libcuxxRandom/README.md  |   44 +
 .../libcuxxRandom/libcuxxRandom.cu            |  180 +++
 .../Tegra/fluidsGLES/fluidsGLES.cpp           |    2 -
 cpp/9_CUDA_Tile/Benchmark_Common/benchmark.h  |  206 +++
 .../Benchmark_Common/matmul_benchmark.h       |   96 ++
 cpp/9_CUDA_Tile/CMakeLists.txt                |   17 +
 cpp/9_CUDA_Tile/README.md                     |   74 +
 cpp/9_CUDA_Tile/helloTile/CMakeLists.txt      |   29 +
 cpp/9_CUDA_Tile/helloTile/README.md           |   31 +
 cpp/9_CUDA_Tile/helloTile/helloTile.cu        |   76 +
 cpp/9_CUDA_Tile/tileBmm/CMakeLists.txt        |   29 +
 cpp/9_CUDA_Tile/tileBmm/README.md             |   28 +
 cpp/9_CUDA_Tile/tileBmm/tileBmm.cu            |  268 ++++
 cpp/9_CUDA_Tile/tileLayerNorm/CMakeLists.txt  |   29 +
 cpp/9_CUDA_Tile/tileLayerNorm/README.md       |   26 +
 .../tileLayerNorm/tileLayerNorm.cu            |  270 ++++
 cpp/9_CUDA_Tile/tileMatmul/CMakeLists.txt     |   29 +
 cpp/9_CUDA_Tile/tileMatmul/README.md          |   55 +
 cpp/9_CUDA_Tile/tileMatmul/tileMatmul.cu      |  282 ++++
 .../tileMatmulAutotuner/CMakeLists.txt        |   67 +
 cpp/9_CUDA_Tile/tileMatmulAutotuner/README.md |   59 +
 .../autotuner_search_space.conf               |   15 +
 .../tileMatmulAutotuner/backend_common.h      |  314 ++++
 .../tileMatmulAutotuner/backend_nvcc.h        |  111 ++
 .../tileMatmulAutotuner/backend_nvrtc.h       |  186 +++
 cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul.cu |  101 ++
 .../tileMatmulAutotuner/matmul_autotuner.cpp  |  331 +++++
 cpp/9_CUDA_Tile/tileRope/CMakeLists.txt       |   29 +
 cpp/9_CUDA_Tile/tileRope/README.md            |   29 +
 cpp/9_CUDA_Tile/tileRope/tileRope.cu          |  274 ++++
 cpp/9_CUDA_Tile/tileSpMV/CMakeLists.txt       |   29 +
 cpp/9_CUDA_Tile/tileSpMV/README.md            |   44 +
 cpp/9_CUDA_Tile/tileSpMV/tileSpMV.cu          |  494 +++++++
 cpp/9_CUDA_Tile/tileTranspose/CMakeLists.txt  |   29 +
 cpp/9_CUDA_Tile/tileTranspose/README.md       |   21 +
 .../tileTranspose/tileTranspose.cu            |  126 ++
 cpp/9_CUDA_Tile/tileVectorAdd/CMakeLists.txt  |   29 +
 cpp/9_CUDA_Tile/tileVectorAdd/README.md       |   24 +
 .../tileVectorAdd/tileVectorAdd.cu            |  136 ++
 cpp/CMakeLists.txt                            |    3 +
 .../blurImageUnifiedMemory/README.md          |   11 +-
 .../blurImageUnifiedMemory.py                 |   12 +-
 .../blurImageUnifiedMemory/requirements.txt   |    2 +-
 .../copyImageArraytoGPU/README.md             |   10 +-
 .../copyImageArraytoGPU.py                    |    2 +-
 .../copyImageArraytoGPU/requirements.txt      |    4 +-
 python/1_GettingStarted/deviceQuery/README.md |    4 +-
 .../deviceQuery/deviceQuery.py                |    2 +-
 .../deviceQuery/requirements.txt              |    2 +-
 .../kernelNsysProfile/requirements.txt        |    4 +-
 .../numpyVsCupy/numpyVsCupy.py                |    2 +-
 .../numpyVsCupy/requirements.txt              |    4 +-
 python/1_GettingStarted/simplePrint/README.md |    2 +-
 .../simplePrint/requirements.txt              |    4 +-
 python/1_GettingStarted/systemInfo/README.md  |    4 +-
 .../systemInfo/requirements.txt               |    2 +-
 .../1_GettingStarted/systemInfo/systemInfo.py |   26 +-
 python/1_GettingStarted/vectorAdd/README.md   |    8 +-
 .../vectorAdd/requirements.txt                |    4 +-
 python/2_CoreConcepts/binarySearch/README.md  |  129 ++
 .../binarySearch/binarySearch.py              |  147 ++
 .../binarySearch/requirements.txt             |    4 +
 .../blockwiseSum/blockwiseSum.py              |    4 +-
 .../blockwiseSum/requirements.txt             |    4 +-
 .../cudaComputeLambdas/README.md              |  130 ++
 .../cudaComputeLambdas/cudaComputeLambdas.py  |  179 +++
 .../cudaComputeLambdas/requirements.txt       |    4 +
 python/2_CoreConcepts/cudaGraphs/README.md    |    8 +-
 .../2_CoreConcepts/cudaGraphs/cudaGraphs.py   |   14 +-
 .../cudaGraphs/requirements.txt               |    4 +-
 .../fftSignalAnalysis/README.md               |    6 +-
 .../fftSignalAnalysis/fftSignalAnalysis.py    |    6 +-
 .../fftSignalAnalysis/requirements.txt        |    4 +-
 .../greenContext/greenContext.py              |    6 +-
 .../greenContext/requirements.txt             |    2 +-
 python/2_CoreConcepts/jitLtoLinking/README.md |    8 +-
 .../jitLtoLinking/jitLtoLinking.py            |   10 +-
 .../jitLtoLinking/requirements.txt            |    4 +-
 .../launchConfigTuning/README.md              |   19 +-
 .../launchConfigTuning/launchConfigTuning.py  |   24 +-
 .../launchConfigTuning/requirements.txt       |    2 +-
 .../matrixMulSharedMem/README.md              |   22 +-
 .../matrixMulSharedMem/matrixMulSharedMem.py  |   11 +-
 .../matrixMulSharedMem/requirements.txt       |   19 +-
 .../2_CoreConcepts/memoryResources/README.md  |   22 +-
 .../memoryResources/memoryResources.py        |   14 +-
 .../memoryResources/requirements.txt          |    4 +-
 python/2_CoreConcepts/pageRank/README.md      |   30 +-
 python/2_CoreConcepts/pageRank/pageRank.py    |   24 +-
 .../2_CoreConcepts/pageRank/requirements.txt  |   11 +-
 .../parallelHistogram/parallelHistogram.py    |    2 +-
 .../parallelHistogram/requirements.txt        |    4 +-
 .../parallelReduction/README.md               |    4 +-
 .../parallelReduction/parallelReduction.py    |    8 +-
 .../parallelReduction/requirements.txt        |    4 +-
 python/2_CoreConcepts/prefixSum/README.md     |   10 +-
 python/2_CoreConcepts/prefixSum/prefixSum.py  |    6 +-
 .../2_CoreConcepts/prefixSum/requirements.txt |    4 +-
 .../processCheckpoint/processCheckpoint.py    |   13 +
 .../processCheckpoint/requirements.txt        |    5 +-
 python/2_CoreConcepts/reduction/README.md     |    2 +-
 python/2_CoreConcepts/reduction/reduction.py  |    6 +-
 .../2_CoreConcepts/reduction/requirements.txt |    2 +-
 .../reductionMultiBlockCG/README.md           |   21 +-
 .../reductionMultiBlockCG.py                  |   23 +-
 .../reductionMultiBlockCG/requirements.txt    |    2 +-
 .../2_CoreConcepts/simpleZeroCopy/README.md   |    2 +-
 .../simpleZeroCopy/requirements.txt           |    2 +-
 .../streamingCopyComputeOverlap/README.md     |    2 +-
 .../requirements.txt                          |    2 +-
 .../streamingCopyComputeOverlap.py            |   22 +-
 python/2_CoreConcepts/tmaTensorMap/README.md  |    8 +-
 .../tmaTensorMap/requirements.txt             |    4 +-
 .../tmaTensorMap/tmaTensorMap.py              |   14 +-
 .../customPyTorchKernel/README.md             |   10 +-
 .../customPyTorchKernel/requirements.txt      |    7 +-
 .../customTensorFlowKernel/README.md          |    2 +-
 .../customTensorFlowKernel/requirements.txt   |    6 +-
 .../ipcMemoryPool/README.md                   |    8 +-
 .../ipcMemoryPool/ipcMemoryPool.py            |    2 +-
 .../ipcMemoryPool/requirements.txt            |    4 +-
 .../multiGPUGradientAverage/README.md         |   30 +-
 .../multiGPUGradientAverage.py                |    4 +-
 .../multiGPUGradientAverage/requirements.txt  |    4 +-
 .../simpleP2P/README.md                       |    2 +-
 .../simpleP2P/requirements.txt                |    2 +-
 .../simpleP2P/simpleP2P.py                    |   12 +-
 python/Utilities/README.md                    |    4 +-
 python/requirements.txt                       |    4 +-
 run_tests.py                                  |   25 +-
 test_args.json                                |   11 +
 150 files changed, 7977 insertions(+), 246 deletions(-)
 create mode 100644 cmake/CPM.cmake
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceFind/CMakeLists.txt
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceFind/README.md
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceFind/cubDeviceFind.cu
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/CMakeLists.txt
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/README.md
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/cubDeviceSegmentedScan.cu
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceTransform/CMakeLists.txt
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceTransform/README.md
 create mode 100644 cpp/4_CUDA_Libraries/cubDeviceTransform/cubDeviceTransform.cu
 create mode 100644 cpp/4_CUDA_Libraries/libcuxxMdspan/CMakeLists.txt
 create mode 100644 cpp/4_CUDA_Libraries/libcuxxMdspan/README.md
 create mode 100644 cpp/4_CUDA_Libraries/libcuxxMdspan/libcuxxMdspan.cu
 create mode 100644 cpp/4_CUDA_Libraries/libcuxxRandom/CMakeLists.txt
 create mode 100644 cpp/4_CUDA_Libraries/libcuxxRandom/README.md
 create mode 100644 cpp/4_CUDA_Libraries/libcuxxRandom/libcuxxRandom.cu
 create mode 100644 cpp/9_CUDA_Tile/Benchmark_Common/benchmark.h
 create mode 100644 cpp/9_CUDA_Tile/Benchmark_Common/matmul_benchmark.h
 create mode 100644 cpp/9_CUDA_Tile/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/README.md
 create mode 100644 cpp/9_CUDA_Tile/helloTile/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/helloTile/README.md
 create mode 100644 cpp/9_CUDA_Tile/helloTile/helloTile.cu
 create mode 100644 cpp/9_CUDA_Tile/tileBmm/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileBmm/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileBmm/tileBmm.cu
 create mode 100644 cpp/9_CUDA_Tile/tileLayerNorm/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileLayerNorm/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileLayerNorm/tileLayerNorm.cu
 create mode 100644 cpp/9_CUDA_Tile/tileMatmul/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileMatmul/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileMatmul/tileMatmul.cu
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/autotuner_search_space.conf
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_common.h
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvcc.h
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvrtc.h
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul.cu
 create mode 100644 cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul_autotuner.cpp
 create mode 100644 cpp/9_CUDA_Tile/tileRope/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileRope/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileRope/tileRope.cu
 create mode 100644 cpp/9_CUDA_Tile/tileSpMV/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileSpMV/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileSpMV/tileSpMV.cu
 create mode 100644 cpp/9_CUDA_Tile/tileTranspose/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileTranspose/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileTranspose/tileTranspose.cu
 create mode 100644 cpp/9_CUDA_Tile/tileVectorAdd/CMakeLists.txt
 create mode 100644 cpp/9_CUDA_Tile/tileVectorAdd/README.md
 create mode 100644 cpp/9_CUDA_Tile/tileVectorAdd/tileVectorAdd.cu
 create mode 100644 python/2_CoreConcepts/binarySearch/README.md
 create mode 100644 python/2_CoreConcepts/binarySearch/binarySearch.py
 create mode 100644 python/2_CoreConcepts/binarySearch/requirements.txt
 create mode 100644 python/2_CoreConcepts/cudaComputeLambdas/README.md
 create mode 100644 python/2_CoreConcepts/cudaComputeLambdas/cudaComputeLambdas.py
 create mode 100644 python/2_CoreConcepts/cudaComputeLambdas/requirements.txt

diff --git a/.gitignore b/.gitignore
index 4b782397..398bd275 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,11 @@
 build
+build-*/
+test-results*/
 .vs
 .clangd
 test
 settings.json
 launch.json
+__pycache__/
+*.py[co]
+.pytest_cache/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e604cef..7ddb8902 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 ## Changelog
 
+### CUDA 13.3
+* Added **CUDA Tile C++** samples under `cpp/9_CUDA_Tile`.
+* Added a set of **CCCL 3.3 feature samples** under `cpp/4_CUDA_Libraries/`, each built against CCCL fetched via CPM (pinned to v3.3.3, with an optional `CCCL_SOURCE_DIR` override):
+    * `cubDeviceFind` - `cub::DeviceFind::FindIf`, `LowerBound`, and `UpperBound` device-wide search algorithms.
+    * `cubDeviceSegmentedScan` - `cub::DeviceSegmentedScan::ExclusiveSegmentedSum` and `InclusiveSegmentedScan` with a custom binary operator.
+    * `cubDeviceTransform` - N-to-M `cub::DeviceTransform::Transform` where the op returns a `cuda::std::tuple`.
+    * `libcuxxRandom` - `cuda::pcg64` and `cuda::std::philox4x32` engines driving the uniform, normal, Poisson, and Bernoulli distributions from `<cuda/std/random>`.
+    * `libcuxxMdspan` - DLPack <-> `cuda::std::mdspan` bridging via `cuda::to_device_mdspan` / `cuda::to_dlpack_tensor`, plus `cuda::shared_memory_mdspan` for multi-dimensional views of shared memory.
+* Added **cuda.compute 1.0 Python samples** under `python/2_CoreConcepts/`:
+    * `cudaComputeLambdas` - Python lambdas / regular callables driving `reduce_into`, `unary_transform`, and `inclusive_scan` in `cuda.compute` (from the `cuda-cccl` package).
+    * `binarySearch` - parallel `cuda.compute.upper_bound` / `lower_bound`, verified against `numpy.searchsorted`.
+
+
 ### CUDA 13.2 (update)
 * Added **CUDA Python samples** under `python/`. These scripts use [CUDA Python](https://nvidia.github.io/cuda-python/) (including `cuda.core`) and are organized like the C++ tree: `1_GettingStarted`, `2_CoreConcepts`, `3_FrameworkInterop`, and `4_DistributedComputing`, plus shared helpers in `python/Utilities`. Each sample includes a `README.md` and `requirements.txt`. They are **not** built by the root CMake project; install dependencies with `pip install -r requirements.txt` in the sample directory, then run the corresponding `.py` file as documented in that sample’s README.
 * Renamed top-level `Samples` directory to `cpp` to accommodate Python samples alongside existing C++ samples; updated path references in `CMakeLists.txt`, `README.md`, and `Common` headers accordingly.
diff --git a/Common/helper_string.h b/Common/helper_string.h
index 89a3fce0..7413466c 100644
--- a/Common/helper_string.h
+++ b/Common/helper_string.h
@@ -354,7 +354,13 @@ inline char *sdkFindFilePath(const char *filename,
 
       "../../../../Common/data/",                     // up 4 in tree
       "../../../Common/data/",                        // up 3 in tree
-      "../../Common/data/"                            // up 2 in tree
+      "../../Common/data/",                           // up 2 in tree
+
+      "../../../../cpp/9_CUDA_Tile/<executable_name>/",        // up 4 in tree
+      "../../../cpp/9_CUDA_Tile/<executable_name>/",           // up 3 in tree
+      "../../cpp/9_CUDA_Tile/<executable_name>/",              // up 2 in tree
+      "../cpp/9_CUDA_Tile/<executable_name>/",                 // up 1 in tree
+      "./cpp/9_CUDA_Tile/<executable_name>/"                   // up 0 in tree
   };
 
   // Extract the executable name
diff --git a/README.md b/README.md
index a0785858..106d3f73 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # CUDA Samples
 
-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 13.2](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrate features in CUDA Toolkit. This version supports [CUDA Toolkit 13.3](https://developer.nvidia.com/cuda-downloads).
 
 ## Release Notes
 
@@ -181,10 +181,10 @@ QNX_HOST=/path/to/qnx/host \
 QNX_TARGET=/path/to/qnx/target \
 cmake .. \
 -DBUILD_TEGRA=True \
--DCMAKE_CUDA_COMPILER=/usr/local/cuda-safe-13.0/bin/nvcc \
+-DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.3/bin/nvcc \
 -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-qnx.cmake \
--DCMAKE_LIBRARY_PATH=/usr/local/cuda-safe-13.0/thor/targets/aarch64-qnx/lib/stubs/ \
--DCMAKE_INCLUDE_PATH=/usr/local/cuda-safe-13.0/thor/targets/aarch64-qnx/include/
+-DCMAKE_LIBRARY_PATH=/usr/local/cuda-13.3/thor/targets/aarch64-qnx/lib/stubs/ \
+-DCMAKE_INCLUDE_PATH=/usr/local/cuda-13.3/thor/targets/aarch64-qnx/include/
 ```
 
 ### Forward Compatibility
@@ -476,6 +476,9 @@ Samples that demonstrate the use of libNVVVM and NVVM IR.
 ### [8. Platform Specific](./cpp/8_Platform_Specific/Tegra/README.md)
 Samples that are specific to certain platforms (Tegra, cuDLA, NvMedia, NvSci, OpenGL ES).
 
+### [9. CUDA Tile](./cpp/9_CUDA_Tile/README.md)
+Samples that demonstrate how to use CUDA Tile C++.
+
 ## Dependencies
 
 Some CUDA Samples rely on third-party applications and/or libraries, or features provided by the CUDA Toolkit and Driver, to either build or execute. These dependencies are listed below.
diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
new file mode 100644
index 00000000..92d0bc6d
--- /dev/null
+++ b/cmake/CPM.cmake
@@ -0,0 +1,1297 @@
+# CPM.cmake - CMake's missing package manager
+# ===========================================
+# See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions.
+#
+# MIT License
+# -----------
+#[[
+  Copyright (c) 2019-2023 Lars Melchior and contributors
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in all
+  copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+]]
+
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+# Initialize the prefix used in CPM's log messages (stored in the cache once set)
+if (NOT CPM_INDENT)
+  set(CPM_INDENT "CPM:" CACHE INTERNAL "")
+endif()
+
+if (NOT COMMAND cpm_message) # define a default logger only if no cpm_message command exists yet
+  function(cpm_message)
+    message(${ARGV})
+  endfunction()
+endif()
+
+set(CURRENT_CPM_VERSION 0.40.2) # version of this copy of CPM.cmake
+
+get_filename_component(
+  CPM_CURRENT_DIRECTORY
+  "${CMAKE_CURRENT_LIST_DIR}"
+  REALPATH
+)
+if (CPM_DIRECTORY) # non-empty once a copy of CPM.cmake has cached its directory
+  if (NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY) # a copy from another directory is active
+    if (CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION)
+      message(
+        AUTHOR_WARNING
+        "${CPM_INDENT} \
+A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \
+It is recommended to upgrade CPM to the most recent version. \
+See https://github.com/cpm-cmake/CPM.cmake for more information."
+      )
+    endif()
+    if (${CMAKE_VERSION} VERSION_LESS "3.17.0")
+      include(FetchContent)
+    endif()
+    return() # skip the rest of this file
+  endif()
+
+  get_property(CPM_INITIALIZED GLOBAL "" PROPERTY CPM_INITIALIZED SET)
+  if (CPM_INITIALIZED) # the CPM_INITIALIZED global property is already set: nothing more to do
+    return()
+  endif()
+endif()
+
+if (CURRENT_CPM_VERSION MATCHES "development-version")
+  message(
+    WARNING
+    "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \
+Please update to a recent release if possible. \
+See https://github.com/cpm-cmake/CPM.cmake for details."
+  )
+endif()
+
+set_property(GLOBAL PROPERTY CPM_INITIALIZED true) # checked above to skip repeated inclusion
+
+macro(cpm_set_policies)
+  # CMP0077 (NEW): the policy allows us to change options without caching
+  cmake_policy(SET CMP0077 NEW)
+  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+  # CMP0126 (NEW, when available): the policy allows us to change set(CACHE) without caching
+  if (POLICY CMP0126)
+    cmake_policy(SET CMP0126 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0126 NEW)
+  endif()
+
+  # CMP0135: use the download time for timestamps instead of the timestamps in the archive. This
+  # allows for proper rebuilds when a project's URL changes
+  if (POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0135 NEW)
+  endif()
+
+  # CMP0150: treat relative git repository paths as being relative to the parent project's remote
+  if (POLICY CMP0150)
+    cmake_policy(SET CMP0150 NEW)
+    set(CMAKE_POLICY_DEFAULT_CMP0150 NEW)
+  endif()
+endmacro()
+cpm_set_policies()
+
+option(
+  CPM_USE_LOCAL_PACKAGES
+  "Always try to use `find_package` to get dependencies"
+  $ENV{CPM_USE_LOCAL_PACKAGES}
+) # like every option here, the default is read from the same-named environment variable
+option(
+  CPM_LOCAL_PACKAGES_ONLY
+  "Only use `find_package` to get dependencies"
+  $ENV{CPM_LOCAL_PACKAGES_ONLY}
+)
+option(
+  CPM_DOWNLOAD_ALL
+  "Always download dependencies from source"
+  $ENV{CPM_DOWNLOAD_ALL}
+)
+option(
+  CPM_DONT_UPDATE_MODULE_PATH
+  "Don't update the module path to allow using find_package"
+  $ENV{CPM_DONT_UPDATE_MODULE_PATH}
+)
+option(
+  CPM_DONT_CREATE_PACKAGE_LOCK
+  "Don't create a package lock file in the binary path"
+  $ENV{CPM_DONT_CREATE_PACKAGE_LOCK}
+)
+option(
+  CPM_INCLUDE_ALL_IN_PACKAGE_LOCK
+  "Add all packages added through CPM.cmake to the package lock"
+  $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK}
+)
+option(
+  CPM_USE_NAMED_CACHE_DIRECTORIES
+  "Use additional directory of package name in cache on the most nested level."
+  $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES}
+)
+
+set(CPM_VERSION ${CURRENT_CPM_VERSION} CACHE INTERNAL "") # read by the version guard at the top
+set(CPM_DIRECTORY ${CPM_CURRENT_DIRECTORY} CACHE INTERNAL "") # directory of the active CPM copy
+set(CPM_FILE ${CMAKE_CURRENT_LIST_FILE} CACHE INTERNAL "") # included by generated Find modules
+set(CPM_PACKAGES "" CACHE INTERNAL "") # start with an empty package list
+set(
+  CPM_DRY_RUN
+  OFF
+  CACHE INTERNAL
+  "Don't download or configure dependencies (for testing)"
+)
+
+if (DEFINED ENV{CPM_SOURCE_CACHE}) # the environment variable supplies the default cache location
+  set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE})
+else()
+  set(CPM_SOURCE_CACHE_DEFAULT OFF)
+endif()
+
+set(
+  CPM_SOURCE_CACHE
+  ${CPM_SOURCE_CACHE_DEFAULT}
+  CACHE PATH
+  "Directory to download CPM dependencies"
+)
+
+if (NOT CPM_DONT_UPDATE_MODULE_PATH)
+  set(CPM_MODULE_PATH "${CMAKE_BINARY_DIR}/CPM_modules" CACHE INTERNAL "")
+  # remove old modules, then recreate the (now empty) module directory
+  file(REMOVE_RECURSE ${CPM_MODULE_PATH})
+  file(MAKE_DIRECTORY ${CPM_MODULE_PATH})
+  # locally added CPM modules should override global packages, so this path is prepended
+  set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}")
+endif()
+
+if (NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+  set(
+    CPM_PACKAGE_LOCK_FILE
+    "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake"
+    CACHE INTERNAL
+    ""
+  )
+  file(
+    WRITE ${CPM_PACKAGE_LOCK_FILE}
+    "# CPM Package Lock\n# This file should be committed to version control\n\n"
+  ) # WRITE replaces the lock file, leaving only this header
+endif()
+
+include(FetchContent)
+
+# Infer the package name from a git repository URI (path or URL) ending in ".git"; else unset RESULT
+function(cpm_package_name_from_git_uri URI RESULT)
+  if ("${URI}" MATCHES "([^/:]+)/?\\.git/?$") # escaped dot: require a literal ".git" suffix
+    set(${RESULT} ${CMAKE_MATCH_1} PARENT_SCOPE)
+  else()
+    unset(${RESULT} PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Try to infer package name and version from an archive url; whatever can't be inferred is unset
+function(cpm_package_name_and_ver_from_url url outName outVer)
+  if (
+    url
+      MATCHES
+      "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)"
+  )
+    # We matched an archive; the first capture group is the file name before the extension
+    set(filename "${CMAKE_MATCH_1}")
+
+    if (
+      filename
+        MATCHES
+        "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)"
+    )
+      # We matched <name>-<version> (e.g. foo-1.2.3)
+      set(${outName} "${CMAKE_MATCH_1}" PARENT_SCOPE)
+      set(${outVer} "${CMAKE_MATCH_2}" PARENT_SCOPE)
+    elseif (filename MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)")
+      # We couldn't find a name, but we found a version
+      #
+      # In many cases (which we don't handle here) the url would look something like
+      # `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. In such a case we can't possibly
+      # distinguish the package name from the irrelevant bits. Moreover if we tried to match the
+      # package name from the filename, we'd get a bogus name at best.
+      unset(${outName} PARENT_SCOPE)
+      set(${outVer} "${CMAKE_MATCH_1}" PARENT_SCOPE)
+    else()
+      # Boldly assume that the file name is the package name.
+      #
+      # Yes, something like `irrelevant/ACTUAL_NAME/irrelevant/download.zip` will ruin our day, but
+      # such cases should be quite rare. No popular service does this... we think.
+      set(${outName} "${filename}" PARENT_SCOPE)
+      unset(${outVer} PARENT_SCOPE)
+    endif()
+  else()
+    # Not an archive url: nothing can be inferred, so both outputs are unset
+    unset(${outName} PARENT_SCOPE)
+    unset(${outVer} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(cpm_find_package NAME VERSION) # find_package wrapper; sets CPM_PACKAGE_FOUND in the caller
+  string(REPLACE " " ";" EXTRA_ARGS "${ARGN}") # split space-separated extra arguments into a list
+  find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET)
+  if (${NAME}_FOUND) # use the NAME parameter rather than the caller's CPM_ARGS_NAME variable
+    if (DEFINED ${NAME}_VERSION)
+      set(VERSION ${${NAME}_VERSION}) # report the version that find_package actually found
+    endif()
+    cpm_message(
+      STATUS
+      "${CPM_INDENT} Using local package ${NAME}@${VERSION}"
+    )
+    CPMRegisterPackage(${NAME} "${VERSION}")
+    set(CPM_PACKAGE_FOUND YES PARENT_SCOPE)
+  else()
+    set(CPM_PACKAGE_FOUND NO PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Create a custom FindXXX.cmake module for a CPM package. This prevents `find_package(NAME)` from
+# finding the system library. No file is written when CPM_DONT_UPDATE_MODULE_PATH is set.
+function(cpm_create_module_file Name)
+  if (NOT CPM_DONT_UPDATE_MODULE_PATH)
+    # overwrite any previous module; it includes CPM.cmake, runs ARGN and sets <Name>_FOUND
+    file(
+      WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake
+      "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)"
+    )
+  endif()
+endfunction()
+
+# Find a package locally (via find_package) or fall back to CPMAddPackage
+function(CPMFindPackage)
+  set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS)
+
+  cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN})
+
+  if (NOT DEFINED CPM_ARGS_VERSION) # no explicit VERSION: try to derive one from GIT_TAG
+    if (DEFINED CPM_ARGS_GIT_TAG)
+      cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION)
+    endif()
+  endif()
+
+  set(downloadPackage ${CPM_DOWNLOAD_ALL}) # global default; per-package overrides follow
+  if (DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME}) # a CMake variable wins over the environment
+    set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  elseif (DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+    set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}})
+  endif()
+  if (downloadPackage) # download requested: skip the local lookup entirely
+    CPMAddPackage(${ARGN})
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  cpm_find_package(
+    ${CPM_ARGS_NAME}
+    "${CPM_ARGS_VERSION}"
+    ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}
+  )
+
+  if (NOT CPM_PACKAGE_FOUND) # not found locally: add the package through CPMAddPackage
+    CPMAddPackage(${ARGN})
+    cpm_export_variables(${CPM_ARGS_NAME})
+  endif()
+endfunction()
+
+# Checks if a package has been added before; sets CPM_PACKAGE_ALREADY_ADDED (YES/NO) in the caller
+function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION)
+  if ("${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES)
+    CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION)
+    if ("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}") # only warn; nothing is re-added
+      message(
+        WARNING
+        "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})."
+      )
+    endif()
+    cpm_get_fetch_properties(${CPM_ARGS_NAME})
+    set(${CPM_ARGS_NAME}_ADDED NO) # set in this function's scope, before the export call below
+    set(CPM_PACKAGE_ALREADY_ADDED YES PARENT_SCOPE)
+    cpm_export_variables(${CPM_ARGS_NAME})
+  else()
+    set(CPM_PACKAGE_ALREADY_ADDED NO PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of
+# arguments which can then be parsed idiomatically. For example gh:foo/bar@1.2.3 will be converted
+# to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3 (stored in outArgs; fatal error if undeterminable)
+function(cpm_parse_add_package_single_arg arg outArgs)
+  # Look for a scheme
+  if ("${arg}" MATCHES "^([a-zA-Z]+):(.+)$")
+    string(TOLOWER "${CMAKE_MATCH_1}" scheme)
+    set(uri "${CMAKE_MATCH_2}")
+
+    # Check for CPM-specific schemes
+    if (scheme STREQUAL "gh")
+      set(out "GITHUB_REPOSITORY;${uri}")
+      set(packageType "git")
+    elseif (scheme STREQUAL "gl")
+      set(out "GITLAB_REPOSITORY;${uri}")
+      set(packageType "git")
+    elseif (scheme STREQUAL "bb")
+      set(out "BITBUCKET_REPOSITORY;${uri}")
+      set(packageType "git")
+      # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine
+      # type: a literal ".git" (escaped dot) before '@', '#' or the end marks a git repository
+    elseif (arg MATCHES "\\.git/?(@|#|$)")
+      set(out "GIT_REPOSITORY;${arg}")
+      set(packageType "git")
+    else()
+      # Fall back to a URL
+      set(out "URL;${arg}")
+      set(packageType "archive")
+
+      # We could also check for SVN since FetchContent supports it, but SVN is so rare these days.
+      # We just won't bother with the additional complexity it will induce in this function. SVN is
+      # done by multi-arg
+    endif()
+  else()
+    if (arg MATCHES "\\.git/?(@|#|$)")
+      set(out "GIT_REPOSITORY;${arg}")
+      set(packageType "git")
+    else()
+      # Give up
+      message(
+        FATAL_ERROR
+        "${CPM_INDENT} Can't determine package type of '${arg}'"
+      )
+    endif()
+  endif()
+
+  # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs
+  # containing '@' can be used
+  string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}")
+
+  # Parse the rest according to package type
+  if (packageType STREQUAL "git")
+    # For git repos we interpret #... as a tag or branch or commit hash
+    string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}")
+  elseif (packageType STREQUAL "archive")
+    # For archives we interpret #... as a URL hash.
+    string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}")
+    # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url
+    # should do this at a later point
+  else()
+    # We should never get here. This is an assertion and hitting it means there's a problem with the
+    # code above. A packageType was set, but not handled by this if-else.
+    message(
+      FATAL_ERROR
+      "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'"
+    )
+  endif()
+
+  set(${outArgs} ${out} PARENT_SCOPE)
+endfunction()
+
+# Check that the working directory for a git repo is clean; stores TRUE/FALSE in isClean (caller)
+function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean)
+  find_package(Git REQUIRED)
+
+  if (NOT GIT_EXECUTABLE)
+    # No git executable, assume directory is clean (NOTE(review): likely unreachable, see REQUIRED)
+    set(${isClean} TRUE PARENT_SCOPE)
+    return()
+  endif()
+
+  # check for uncommitted changes
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} status --porcelain
+    RESULT_VARIABLE resultGitStatus
+    OUTPUT_VARIABLE repoStatus
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_QUIET
+    WORKING_DIRECTORY ${repoPath}
+  )
+  if (resultGitStatus)
+    # not supposed to happen, assume clean anyway
+    message(
+      WARNING
+      "${CPM_INDENT} Calling git status on folder ${repoPath} failed"
+    )
+    set(${isClean} TRUE PARENT_SCOPE)
+    return()
+  endif()
+
+  if (NOT "${repoStatus}" STREQUAL "")
+    set(${isClean} FALSE PARENT_SCOPE)
+    return()
+  endif()
+
+  # check for committed changes
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag}
+    RESULT_VARIABLE resultGitDiff
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_QUIET
+    WORKING_DIRECTORY ${repoPath}
+  )
+
+  if (resultGitDiff EQUAL 0) # test by name: the result may be a multi-word error string
+    set(${isClean} TRUE PARENT_SCOPE)
+  else()
+    set(${isClean} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Add PATCH_COMMAND to CPM_ARGS_UNPARSED_ARGUMENTS. The patch files listed in ARGN are turned into
+# one `patch -p1 < file` call each, chained with `&&`, as a `PATCH_COMMAND` suitable for
+# `ExternalProject_Add()`. The result replaces the parent scope's `CPM_ARGS_UNPARSED_ARGUMENTS`.
+function(cpm_add_patches)
+  # Without patch files there is nothing to add.
+  if (NOT ARGN)
+    return()
+  endif()
+
+  # Locate the patch program.
+  find_program(PATCH_EXECUTABLE patch)
+  if (WIN32 AND NOT PATCH_EXECUTABLE)
+    # The Windows git distribution ships a patch.exe. Starting from the directory of the git
+    # executable, look into `../usr/bin` and `../../usr/bin` for it.
+    find_package(Git QUIET)
+    if (GIT_EXECUTABLE)
+      get_filename_component(gitDir ${GIT_EXECUTABLE} DIRECTORY)
+      get_filename_component(
+        gitDirParent
+        ${gitDir}
+        DIRECTORY
+      )
+      get_filename_component(
+        gitDirGrandparent
+        ${gitDirParent}
+        DIRECTORY
+      )
+      find_program(
+        PATCH_EXECUTABLE
+        patch
+        HINTS
+          "${gitDirParent}/usr/bin"
+          "${gitDirGrandparent}/usr/bin"
+      )
+    endif()
+  endif()
+  if (NOT PATCH_EXECUTABLE)
+    message(
+      FATAL_ERROR
+      "Couldn't find `patch` executable to use with PATCHES keyword."
+    )
+  endif()
+
+  # Work on a copy of the arguments collected so far.
+  set(patchedArgs ${CPM_ARGS_UNPARSED_ARGUMENTS})
+
+  # Check every file (erroring out if it is missing) and append its patch invocation.
+  set(commandSeparator "PATCH_COMMAND")
+  foreach (patchPath ${ARGN})
+    # A file that is not found as given is looked up relative to the current list directory.
+    if (NOT EXISTS "${patchPath}")
+      if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${patchPath}")
+        message(FATAL_ERROR "Couldn't find patch file: '${patchPath}'")
+      endif()
+      set(patchPath "${CMAKE_CURRENT_LIST_DIR}/${patchPath}")
+    endif()
+
+    # The patch invocation needs an absolute path.
+    get_filename_component(patchPath "${patchPath}" ABSOLUTE)
+
+    # "PATCH_COMMAND" introduces the first invocation, "&&" every following one.
+    list(
+      APPEND patchedArgs
+      "${commandSeparator}"
+      "${PATCH_EXECUTABLE}"
+      "-p1"
+      "<"
+      "${patchPath}"
+    )
+    set(commandSeparator "&&")
+  endforeach()
+
+  # Hand the extended list back to the caller.
+  set(CPM_ARGS_UNPARSED_ARGUMENTS ${patchedArgs} PARENT_SCOPE)
+endfunction()
+
+# Writes FetchContent's internal bookkeeping properties for `contentName` (source dir, binary dir
+# and the "populated" flag) so that CPM.cmake can overload FetchContent calls. These are internal
+# CMake properties: use with care, it may need modification in future CMake versions. Source:
+# https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152
+function(cpm_override_fetchcontent contentName)
+  cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "")
+  if (NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "")
+    message(
+      FATAL_ERROR
+      "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}"
+    )
+  endif()
+
+  string(TOLOWER ${contentName} lowerName)
+  set(propertyBase "_FetchContent_${lowerName}")
+
+  # Source directory of the content.
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyBase}_sourceDir
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyBase}_sourceDir "${arg_SOURCE_DIR}")
+
+  # Binary directory of the content.
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyBase}_binaryDir
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyBase}_binaryDir "${arg_BINARY_DIR}")
+
+  # Flag recording that the content has been populated.
+  define_property(
+    GLOBAL
+    PROPERTY ${propertyBase}_populated
+    BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()"
+    FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}"
+  )
+  set_property(GLOBAL PROPERTY ${propertyBase}_populated TRUE)
+endfunction()
+
+# Download and add a package from source. Takes either one shorthand string or keyword arguments.
+function(CPMAddPackage)
+  cpm_set_policies()
+
+  list(LENGTH ARGN argnLength)
+  if (argnLength EQUAL 1)
+    cpm_parse_add_package_single_arg("${ARGN}" ARGN)
+
+    # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM
+    set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;")
+  endif()
+
+  set(
+    oneValueArgs
+    NAME
+    FORCE
+    VERSION
+    GIT_TAG
+    DOWNLOAD_ONLY
+    GITHUB_REPOSITORY
+    GITLAB_REPOSITORY
+    BITBUCKET_REPOSITORY
+    GIT_REPOSITORY
+    SOURCE_DIR
+    FIND_PACKAGE_ARGUMENTS
+    NO_CACHE
+    SYSTEM
+    GIT_SHALLOW
+    EXCLUDE_FROM_ALL
+    SOURCE_SUBDIR
+    CUSTOM_CACHE_KEY
+  )
+
+  set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND PATCHES)
+
+  cmake_parse_arguments(
+    CPM_ARGS
+    ""
+    "${oneValueArgs}"
+    "${multiValueArgs}"
+    "${ARGN}"
+  )
+
+  # Set default values for arguments
+
+  if (NOT DEFINED CPM_ARGS_VERSION)
+    if (DEFINED CPM_ARGS_GIT_TAG)
+      cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION)
+    endif()
+  endif()
+
+  if (CPM_ARGS_DOWNLOAD_ONLY)
+    set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY})
+  else()
+    set(DOWNLOAD_ONLY NO)
+  endif()
+
+  if (DEFINED CPM_ARGS_GITHUB_REPOSITORY)
+    set(
+      CPM_ARGS_GIT_REPOSITORY
+      "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git"
+    )
+  elseif (DEFINED CPM_ARGS_GITLAB_REPOSITORY)
+    set(
+      CPM_ARGS_GIT_REPOSITORY
+      "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git"
+    )
+  elseif (DEFINED CPM_ARGS_BITBUCKET_REPOSITORY)
+    set(
+      CPM_ARGS_GIT_REPOSITORY
+      "https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git"
+    )
+  endif()
+
+  if (DEFINED CPM_ARGS_GIT_REPOSITORY)
+    list(
+      APPEND CPM_ARGS_UNPARSED_ARGUMENTS
+      GIT_REPOSITORY
+      ${CPM_ARGS_GIT_REPOSITORY}
+    )
+    if (NOT DEFINED CPM_ARGS_GIT_TAG)
+      set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION}) # default tag: the version prefixed with "v"
+    endif()
+
+    # If a name wasn't provided, try to infer it from the git repo
+    if (NOT DEFINED CPM_ARGS_NAME)
+      cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME)
+    endif()
+  endif()
+
+  set(CPM_SKIP_FETCH FALSE) # becomes TRUE below when an existing source cache entry is reused
+
+  if (DEFINED CPM_ARGS_GIT_TAG)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG})
+    # If GIT_SHALLOW is explicitly specified, honor the value.
+    if (DEFINED CPM_ARGS_GIT_SHALLOW)
+      list(
+        APPEND CPM_ARGS_UNPARSED_ARGUMENTS
+        GIT_SHALLOW
+        ${CPM_ARGS_GIT_SHALLOW}
+      )
+    endif()
+  endif()
+
+  if (DEFINED CPM_ARGS_URL)
+    # If a name or version aren't provided, try to infer them from the URL
+    list(GET CPM_ARGS_URL 0 firstUrl)
+    cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl)
+    # If we fail to obtain name and version from the first URL, we could try other URLs if any.
+    # However multiple URLs are expected to be quite rare, so for now we won't bother.
+
+    # If the caller provided their own name and version, they trump the inferred ones.
+    if (NOT DEFINED CPM_ARGS_NAME)
+      set(CPM_ARGS_NAME ${nameFromUrl})
+    endif()
+    if (NOT DEFINED CPM_ARGS_VERSION)
+      set(CPM_ARGS_VERSION ${verFromUrl})
+    endif()
+
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}")
+  endif()
+
+  # Check for required arguments
+
+  if (NOT DEFINED CPM_ARGS_NAME)
+    message(
+      FATAL_ERROR
+      "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'"
+    )
+  endif()
+
+  # Check if package has been added before
+  cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+  if (CPM_PACKAGE_ALREADY_ADDED)
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  # Check for manual overrides (CPM_<name>_SOURCE names a local source directory to use instead)
+  if (NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "")
+    set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE})
+    set(CPM_${CPM_ARGS_NAME}_SOURCE "") # cleared in this scope before the nested call
+    CPMAddPackage(
+      NAME "${CPM_ARGS_NAME}"
+      SOURCE_DIR "${PACKAGE_SOURCE}"
+      EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+      SYSTEM "${CPM_ARGS_SYSTEM}"
+      PATCHES "${CPM_ARGS_PATCHES}"
+      OPTIONS "${CPM_ARGS_OPTIONS}"
+      SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}"
+      DOWNLOAD_ONLY "${DOWNLOAD_ONLY}"
+      FORCE True
+    )
+    cpm_export_variables(${CPM_ARGS_NAME})
+    return()
+  endif()
+
+  # Check for available declaration (CPM_DECLARATION_<name> holds the arguments to use instead)
+  if (
+    NOT CPM_ARGS_FORCE
+    AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL ""
+  )
+    set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}})
+    set(CPM_DECLARATION_${CPM_ARGS_NAME} "") # cleared in this scope before the nested call
+    CPMAddPackage(${declaration})
+    cpm_export_variables(${CPM_ARGS_NAME})
+    # checking again to ensure version and option compatibility
+    cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}")
+    return()
+  endif()
+
+  if (NOT CPM_ARGS_FORCE)
+    if (CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY)
+      cpm_find_package(
+        ${CPM_ARGS_NAME}
+        "${CPM_ARGS_VERSION}"
+        ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}
+      )
+
+      if (CPM_PACKAGE_FOUND)
+        cpm_export_variables(${CPM_ARGS_NAME})
+        return()
+      endif()
+
+      if (CPM_LOCAL_PACKAGES_ONLY)
+        message(
+          SEND_ERROR
+          "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})"
+        )
+      endif()
+    endif()
+  endif()
+
+  CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}")
+
+  if (DEFINED CPM_ARGS_GIT_TAG)
+    set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}")
+  elseif (DEFINED CPM_ARGS_SOURCE_DIR)
+    set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}")
+  else()
+    set(PACKAGE_INFO "${CPM_ARGS_VERSION}")
+  endif()
+
+  if (DEFINED FETCHCONTENT_BASE_DIR)
+    # respect user's FETCHCONTENT_BASE_DIR if set
+    set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR})
+  else()
+    set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps)
+  endif()
+
+  cpm_add_patches(${CPM_ARGS_PATCHES})
+
+  if (DEFINED CPM_ARGS_DOWNLOAD_COMMAND)
+    list(
+      APPEND CPM_ARGS_UNPARSED_ARGUMENTS
+      DOWNLOAD_COMMAND
+      ${CPM_ARGS_DOWNLOAD_COMMAND}
+    )
+  elseif (DEFINED CPM_ARGS_SOURCE_DIR)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR})
+    if (NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR})
+      # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work
+      # for relative paths.
+      get_filename_component(
+        source_directory
+        ${CPM_ARGS_SOURCE_DIR}
+        REALPATH
+        BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}
+      )
+    else()
+      set(source_directory ${CPM_ARGS_SOURCE_DIR})
+    endif()
+    if (NOT EXISTS ${source_directory})
+      string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+      # remove timestamps so CMake will re-download the dependency
+      file(
+        REMOVE_RECURSE
+        "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild"
+      )
+    endif()
+  elseif (CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE)
+    string(TOLOWER ${CPM_ARGS_NAME} lower_case_name)
+    set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS})
+    list(SORT origin_parameters)
+    if (CPM_ARGS_CUSTOM_CACHE_KEY)
+      # Application set a custom unique directory name
+      set(
+        download_directory
+        ${CPM_SOURCE_CACHE}/${lower_case_name}/${CPM_ARGS_CUSTOM_CACHE_KEY}
+      )
+    elseif (CPM_USE_NAMED_CACHE_DIRECTORIES)
+      string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG")
+      set(
+        download_directory
+        ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME}
+      )
+    else()
+      string(SHA1 origin_hash "${origin_parameters}")
+      set(
+        download_directory
+        ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}
+      )
+    endif()
+    # Expand `download_directory` relative path. This is important because EXISTS doesn't work for
+    # relative paths.
+    get_filename_component(download_directory ${download_directory} ABSOLUTE)
+    list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory})
+
+    if (CPM_SOURCE_CACHE)
+      file(LOCK ${download_directory}/../cmake.lock) # released below: at once if cached, else after the fetch
+    endif()
+
+    if (EXISTS ${download_directory})
+      if (CPM_SOURCE_CACHE)
+        file(LOCK ${download_directory}/../cmake.lock RELEASE)
+      endif()
+
+      cpm_store_fetch_properties(
+        ${CPM_ARGS_NAME}
+        "${download_directory}"
+        "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+      )
+      cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+
+      if (
+        DEFINED CPM_ARGS_GIT_TAG
+        AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS)
+      )
+        # warn if cache has been changed since checkout
+        cpm_check_git_working_dir_is_clean(
+          ${download_directory}
+          ${CPM_ARGS_GIT_TAG}
+          IS_CLEAN
+        )
+        if (NOT ${IS_CLEAN})
+          message(
+            WARNING
+            "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty"
+          )
+        endif()
+      endif()
+
+      cpm_add_subdirectory(
+        "${CPM_ARGS_NAME}"
+        "${DOWNLOAD_ONLY}"
+        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        "${${CPM_ARGS_NAME}_BINARY_DIR}"
+        "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+        "${CPM_ARGS_SYSTEM}"
+        "${CPM_ARGS_OPTIONS}"
+      )
+      set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}")
+
+      # As the source dir is already cached/populated, we override the call to FetchContent.
+      set(CPM_SKIP_FETCH TRUE)
+      cpm_override_fetchcontent(
+        "${lower_case_name}"
+        SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}"
+      )
+    else()
+      # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but
+      # it should guarantee no commit hash get mis-detected.
+      if (NOT DEFINED CPM_ARGS_GIT_SHALLOW)
+        cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH)
+        if (NOT ${IS_HASH})
+          list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE)
+        endif()
+      endif()
+
+      # remove timestamps so CMake will re-download the dependency
+      file(
+        REMOVE_RECURSE
+        ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild
+      )
+      set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}")
+    endif()
+  endif()
+
+  cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")")
+
+  if (CPM_PACKAGE_LOCK_ENABLED)
+    if (
+      (CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR)
+      OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK
+    )
+      cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}")
+    elseif (CPM_ARGS_SOURCE_DIR)
+      cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory")
+    else()
+      cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}")
+    endif()
+  endif()
+
+  cpm_message(
+    STATUS
+    "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})"
+  )
+
+  if (NOT CPM_SKIP_FETCH)
+    # CMake 3.28 added EXCLUDE, SYSTEM (3.25), and SOURCE_SUBDIR (3.18) to FetchContent_Declare.
+    # Calling FetchContent_MakeAvailable will then internally forward these options to
+    # add_subdirectory. Up until these changes, we had to call FetchContent_Populate and
+    # add_subdirectory separately, which is no longer necessary and has been deprecated as of 3.30.
+    set(fetchContentDeclareExtraArgs "")
+    if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0")
+      if (${CPM_ARGS_EXCLUDE_FROM_ALL})
+        list(APPEND fetchContentDeclareExtraArgs EXCLUDE_FROM_ALL)
+      endif()
+      if (${CPM_ARGS_SYSTEM})
+        list(APPEND fetchContentDeclareExtraArgs SYSTEM)
+      endif()
+      if (DEFINED CPM_ARGS_SOURCE_SUBDIR)
+        list(
+          APPEND fetchContentDeclareExtraArgs
+          SOURCE_SUBDIR
+          ${CPM_ARGS_SOURCE_SUBDIR}
+        )
+      endif()
+      # For CMake version <3.28 OPTIONS are parsed in cpm_add_subdirectory
+      if (CPM_ARGS_OPTIONS AND NOT DOWNLOAD_ONLY)
+        foreach (OPTION ${CPM_ARGS_OPTIONS})
+          cpm_parse_option("${OPTION}")
+          set(${OPTION_KEY} "${OPTION_VALUE}")
+        endforeach()
+      endif()
+    endif()
+    cpm_declare_fetch(
+      "${CPM_ARGS_NAME}"
+      ${fetchContentDeclareExtraArgs}
+      "${CPM_ARGS_UNPARSED_ARGUMENTS}"
+    )
+
+    cpm_fetch_package(
+      "${CPM_ARGS_NAME}"
+      ${DOWNLOAD_ONLY}
+      populated
+      ${CPM_ARGS_UNPARSED_ARGUMENTS}
+    )
+    if (CPM_SOURCE_CACHE AND download_directory)
+      file(LOCK ${download_directory}/../cmake.lock RELEASE)
+    endif()
+    if (${populated} AND ${CMAKE_VERSION} VERSION_LESS "3.28.0")
+      cpm_add_subdirectory(
+        "${CPM_ARGS_NAME}"
+        "${DOWNLOAD_ONLY}"
+        "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}"
+        "${${CPM_ARGS_NAME}_BINARY_DIR}"
+        "${CPM_ARGS_EXCLUDE_FROM_ALL}"
+        "${CPM_ARGS_SYSTEM}"
+        "${CPM_ARGS_OPTIONS}"
+      )
+    endif()
+    cpm_get_fetch_properties("${CPM_ARGS_NAME}")
+  endif()
+
+  set(${CPM_ARGS_NAME}_ADDED YES) # exported to the caller by cpm_export_variables below
+  cpm_export_variables("${CPM_ARGS_NAME}")
+endfunction()
+
+# Adds the package `Name` from its stored declaration; reports an error if none was declared.
+macro(CPMGetPackage Name)
+  if (NOT DEFINED "CPM_DECLARATION_${Name}")
+    message(
+      SEND_ERROR
+      "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available"
+    )
+  else()
+    CPMAddPackage(NAME ${Name})
+  endif()
+endmacro()
+
+# Macro: sets CPM_LAST_PACKAGE_NAME and the `<name>_*` result variables in the parent scope.
+macro(cpm_export_variables name)
+  set(CPM_LAST_PACKAGE_NAME "${name}" PARENT_SCOPE)
+  set(${name}_ADDED "${${name}_ADDED}" PARENT_SCOPE)
+  set(${name}_BINARY_DIR "${${name}_BINARY_DIR}" PARENT_SCOPE)
+  set(${name}_SOURCE_DIR "${${name}_SOURCE_DIR}" PARENT_SCOPE)
+endmacro()
+
+# Declares a package: stores ARGN in CPM_DECLARATION_<Name> so that a non-FORCE CPMAddPackage call
+# for that name uses these arguments instead. A previous declaration is never overridden.
+macro(CPMDeclarePackage Name)
+  if (NOT DEFINED "CPM_DECLARATION_${Name}")
+    set("CPM_DECLARATION_${Name}" "${ARGN}")
+  endif()
+endmacro()
+
+# Appends a `CPMDeclarePackage(...)` entry for `Name`, formatted from ARGN, to the package lock
+# file. Does nothing when CPM_DONT_CREATE_PACKAGE_LOCK is set.
+function(cpm_add_to_package_lock Name)
+  if (CPM_DONT_CREATE_PACKAGE_LOCK)
+    return()
+  endif()
+  cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN})
+  file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n")
+endfunction()
+
+# Appends a commented-out `CPMDeclarePackage(...)` entry for `Name` to the package lock file.
+function(cpm_add_comment_to_package_lock Name)
+  if (CPM_DONT_CREATE_PACKAGE_LOCK)
+    return()
+  endif()
+  cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN})
+  set(entry "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n")
+  file(APPEND ${CPM_PACKAGE_LOCK_FILE} "${entry}")
+endfunction()
+
+# Enables the package lock: includes `file` if it exists and creates the target
+# `cpm-update-package-lock`, which copies ${CPM_PACKAGE_LOCK_FILE} over `file`
+macro(CPMUsePackageLock file)
+  if (NOT CPM_DONT_CREATE_PACKAGE_LOCK)
+    get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE)
+    if (EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH})
+      include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH})
+    endif()
+    if (NOT TARGET cpm-update-package-lock) # the update target is created only once
+      add_custom_target(
+        cpm-update-package-lock
+        COMMAND
+          ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE}
+          ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}
+      )
+    endif()
+    set(CPM_PACKAGE_LOCK_ENABLED true) # checked by CPMAddPackage before it writes lock entries
+  endif()
+endmacro()
+
+# Records `PACKAGE` and its `VERSION` in the internal cache entries that track added packages.
+function(CPMRegisterPackage PACKAGE VERSION)
+  # Extend the cached package list by the new name.
+  set(CPM_PACKAGES ${CPM_PACKAGES} ${PACKAGE} CACHE INTERNAL "")
+  set("CPM_PACKAGE_${PACKAGE}_VERSION" ${VERSION} CACHE INTERNAL "")
+endfunction()
+
+# Stores the version recorded in CPM_PACKAGE_<PACKAGE>_VERSION in `OUTPUT` in the parent scope
+function(CPMGetPackageVersion PACKAGE OUTPUT)
+  set(${OUTPUT} "${CPM_PACKAGE_${PACKAGE}_VERSION}" PARENT_SCOPE)
+endfunction()
+
+# Passes `PACKAGE` and ARGN on to FetchContent_Declare. During a dry run (CPM_DRY_RUN) only a
+# status message is printed and nothing is declared.
+function(cpm_declare_fetch PACKAGE)
+  if (${CPM_DRY_RUN})
+    cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)")
+  else()
+    FetchContent_Declare(${PACKAGE} ${ARGN})
+  endif()
+endfunction()
+
+# Exports <PACKAGE>_SOURCE_DIR / <PACKAGE>_BINARY_DIR from the stored CPM_PACKAGE_<PACKAGE>_* values
+function(cpm_get_fetch_properties PACKAGE)
+  if (${CPM_DRY_RUN})
+    return()
+  endif()
+  foreach (dirKind SOURCE_DIR BINARY_DIR)
+    set(${PACKAGE}_${dirKind} "${CPM_PACKAGE_${PACKAGE}_${dirKind}}" PARENT_SCOPE)
+  endforeach()
+endfunction()
+
+# Saves the source and binary directory of `PACKAGE` as internal cache entries (not in a dry run).
+function(cpm_store_fetch_properties PACKAGE source_dir binary_dir)
+  if (${CPM_DRY_RUN})
+    return()
+  endif()
+  set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR "${binary_dir}" CACHE INTERNAL "")
+  set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR "${source_dir}" CACHE INTERNAL "")
+endfunction()
+
+# Adds a package as a subdirectory unless DOWNLOAD_ONLY is set or SOURCE_DIR has no CMakeLists.txt.
+# - PACKAGE: name appended to CPM_INDENT while the subdirectory is processed
+# - SOURCE_DIR, BINARY_DIR: directories passed on to add_subdirectory()
+# - EXCLUDE: if true, EXCLUDE_FROM_ALL is passed
+# - SYSTEM: if true and CMake is at least 3.25, SYSTEM is passed
+# - OPTIONS: list of "KEY VALUE" entries set as variables before the subdirectory is added
+function(cpm_add_subdirectory PACKAGE DOWNLOAD_ONLY SOURCE_DIR BINARY_DIR EXCLUDE SYSTEM OPTIONS)
+  if (DOWNLOAD_ONLY OR NOT EXISTS ${SOURCE_DIR}/CMakeLists.txt)
+    return()
+  endif()
+
+  set(addSubdirectoryExtraArgs "")
+  if (EXCLUDE)
+    list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL)
+  endif()
+  if ("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25")
+    # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM
+    list(APPEND addSubdirectoryExtraArgs SYSTEM)
+  endif()
+  if (OPTIONS)
+    foreach (optionEntry ${OPTIONS})
+      cpm_parse_option("${optionEntry}")
+      set(${OPTION_KEY} "${OPTION_VALUE}")
+    endforeach()
+  endif()
+
+  # Extend the message indent with the package name for the nested project, then restore it.
+  set(CPM_OLD_INDENT "${CPM_INDENT}")
+  set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:")
+  add_subdirectory(${SOURCE_DIR} ${BINARY_DIR} ${addSubdirectoryExtraArgs})
+  set(CPM_INDENT "${CPM_OLD_INDENT}")
+endfunction()
+
+# Downloads a previously declared package via FetchContent and exports `${PACKAGE}_SOURCE_DIR` and
+# `${PACKAGE}_BINARY_DIR` to the parent scope. `populated` is set to TRUE if this call populated it.
+function(cpm_fetch_package PACKAGE DOWNLOAD_ONLY populated)
+  set(${populated} FALSE PARENT_SCOPE)
+  if (${CPM_DRY_RUN})
+    cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)")
+    return()
+  endif()
+
+  FetchContent_GetProperties(${PACKAGE})
+
+  string(TOLOWER "${PACKAGE}" lower_case_name)
+
+  if (NOT ${lower_case_name}_POPULATED)
+    if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.28.0")
+      if (DOWNLOAD_ONLY)
+        # MakeAvailable will call add_subdirectory internally which is not what we want when
+        # DOWNLOAD_ONLY is set. Populate will only download the dependency without adding it to the
+        # build
+        FetchContent_Populate(
+          ${PACKAGE}
+          SOURCE_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-src"
+          BINARY_DIR "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build"
+          SUBBUILD_DIR
+            "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild"
+          ${ARGN}
+        )
+      else()
+        FetchContent_MakeAvailable(${PACKAGE})
+      endif()
+    else()
+      FetchContent_Populate(${PACKAGE})
+    endif()
+    set(${populated} TRUE PARENT_SCOPE)
+  endif()
+  # Keyed by the PACKAGE argument; the paths are quoted so an empty value is still an argument.
+  cpm_store_fetch_properties(
+    "${PACKAGE}"
+    "${${lower_case_name}_SOURCE_DIR}"
+    "${${lower_case_name}_BINARY_DIR}"
+  )
+
+  set(${PACKAGE}_SOURCE_DIR ${${lower_case_name}_SOURCE_DIR} PARENT_SCOPE)
+  set(${PACKAGE}_BINARY_DIR ${${lower_case_name}_BINARY_DIR} PARENT_SCOPE)
+endfunction()
+
+# Splits "KEY VALUE" at the first space into OPTION_KEY and OPTION_VALUE (set in the parent scope).
+function(cpm_parse_option OPTION)
+  string(REGEX MATCH "^[^ ]+" key "${OPTION}")
+  string(LENGTH "${key}" keyLength)
+  string(LENGTH "${OPTION}" totalLength)
+  if (NOT keyLength STREQUAL totalLength)
+    math(EXPR valueStart "${keyLength}+1")
+    string(SUBSTRING "${OPTION}" "${valueStart}" "-1" value)
+  else()
+    # no value for key provided, assume user wants to set option to "ON"
+    set(value "ON")
+  endif()
+  set(OPTION_KEY "${key}" PARENT_SCOPE)
+  set(OPTION_VALUE "${value}" PARENT_SCOPE)
+endfunction()
+
+# Guesses the version from a git tag: 0 for a 40-character tag, else its leading "[v]<digits.dots>"
+function(cpm_get_version_from_git_tag GIT_TAG RESULT)
+  string(LENGTH "${GIT_TAG}" length) # quoted so that an empty tag is still a valid argument
+  if (length EQUAL 40)
+    # GIT_TAG is probably a git hash
+    set(${RESULT} 0 PARENT_SCOPE)
+  else()
+    string(REGEX MATCH "v?([0123456789.]*).*" _ "${GIT_TAG}")
+    set(${RESULT} ${CMAKE_MATCH_1} PARENT_SCOPE) # empty unless the tag starts with [v]<digits>
+  endif()
+endfunction()
+
+# Guesses whether the git tag is a commit hash rather than an actual tag or a branch name.
+# `RESULT` is set in the parent scope to 1 for a purely hexadecimal `GIT_TAG` of 7 to 40
+# characters and to 0 for anything else.
+function(cpm_is_git_tag_commit_hash GIT_TAG RESULT)
+  string(LENGTH "${GIT_TAG}" length)
+  # full hash has 40 characters, and short hash has at least 7 characters.
+  # The tag is quoted in the MATCHES test so that it is always compared as text; unquoted, a tag
+  # that happens to equal a variable name would be replaced by that variable's value.
+  if (length GREATER 6 AND length LESS 41 AND "${GIT_TAG}" MATCHES "^[a-fA-F0-9]+$")
+    set(${RESULT} 1 PARENT_SCOPE)
+  else()
+    set(${RESULT} 0 PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Formats the CPMAddPackage arguments in ARGN as the body of a package lock entry and stores the
+# text in `OUT_VAR` (parent scope). One-value keywords become "  KEY value" lines, multi-value
+# keywords a "  KEY" line followed by one quoted value per line, and all unrecognised arguments
+# share one final line. If `IS_IN_COMMENT` is true, every generated line starts with "#".
+function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT)
+  set(
+    singleValueKeys
+    NAME
+    FORCE
+    VERSION
+    GIT_TAG
+    DOWNLOAD_ONLY
+    GITHUB_REPOSITORY
+    GITLAB_REPOSITORY
+    BITBUCKET_REPOSITORY
+    GIT_REPOSITORY
+    SOURCE_DIR
+    FIND_PACKAGE_ARGUMENTS
+    NO_CACHE
+    SYSTEM
+    GIT_SHALLOW
+    EXCLUDE_FROM_ALL
+    SOURCE_SUBDIR
+  )
+  set(listValueKeys URL OPTIONS DOWNLOAD_COMMAND)
+  cmake_parse_arguments(
+    CPM_ARGS
+    ""
+    "${singleValueKeys}"
+    "${listValueKeys}"
+    ${ARGN}
+  )
+
+  # Marker put in front of every generated line.
+  set(linePrefix "")
+  if (${IS_IN_COMMENT})
+    set(linePrefix "#")
+  endif()
+
+  # One-value keywords.
+  foreach (key ${singleValueKeys})
+    if (DEFINED CPM_ARGS_${key})
+      if (${key} STREQUAL "SOURCE_DIR")
+        # Refer to the source tree symbolically instead of by its absolute path.
+        string(
+          REPLACE
+          ${CMAKE_SOURCE_DIR}
+          "\${CMAKE_SOURCE_DIR}"
+          CPM_ARGS_${key}
+          ${CPM_ARGS_${key}}
+        )
+      endif()
+      string(APPEND PRETTY_OUT_VAR "${linePrefix}  ${key} ${CPM_ARGS_${key}}\n")
+    endif()
+  endforeach()
+
+  # Multi-value keywords.
+  foreach (key ${listValueKeys})
+    if (DEFINED CPM_ARGS_${key})
+      string(APPEND PRETTY_OUT_VAR "${linePrefix}  ${key}\n")
+      foreach (item ${CPM_ARGS_${key}})
+        string(APPEND PRETTY_OUT_VAR "${linePrefix}    \"${item}\"\n")
+      endforeach()
+    endif()
+  endforeach()
+
+  # Arguments cmake_parse_arguments did not recognise.
+  if (NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "")
+    string(APPEND PRETTY_OUT_VAR "${linePrefix} ")
+    foreach (leftover ${CPM_ARGS_UNPARSED_ARGUMENTS})
+      string(APPEND PRETTY_OUT_VAR " ${leftover}")
+    endforeach()
+    string(APPEND PRETTY_OUT_VAR "\n")
+  endif()
+
+  set(${OUT_VAR} ${PRETTY_OUT_VAR} PARENT_SCOPE)
+endfunction()
diff --git a/cpp/4_CUDA_Libraries/CMakeLists.txt b/cpp/4_CUDA_Libraries/CMakeLists.txt
index e425989a..0e5dfe9d 100644
--- a/cpp/4_CUDA_Libraries/CMakeLists.txt
+++ b/cpp/4_CUDA_Libraries/CMakeLists.txt
@@ -9,6 +9,9 @@ add_subdirectory(conjugateGradientMultiBlockCG)
 add_subdirectory(conjugateGradientMultiDeviceCG)
 add_subdirectory(conjugateGradientPrecond)
 add_subdirectory(conjugateGradientUM)
+add_subdirectory(cubDeviceFind)
+add_subdirectory(cubDeviceSegmentedScan)
+add_subdirectory(cubDeviceTransform)
 add_subdirectory(cudaNvSci)
 add_subdirectory(cuSolverDn_LinearSolver)
 add_subdirectory(cuSolverRf)
@@ -18,6 +21,8 @@ add_subdirectory(cuSolverSp_LowlevelQR)
 add_subdirectory(freeImageInteropNPP)
 add_subdirectory(histEqualizationNPP)
 add_subdirectory(jitLto)
+add_subdirectory(libcuxxMdspan)
+add_subdirectory(libcuxxRandom)
 add_subdirectory(lineOfSight)
 add_subdirectory(matrixMulCUBLAS)
 add_subdirectory(nvJPEG)
diff --git a/cpp/4_CUDA_Libraries/cubDeviceFind/CMakeLists.txt b/cpp/4_CUDA_Libraries/cubDeviceFind/CMakeLists.txt
new file mode 100644
index 00000000..2937b837
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceFind/CMakeLists.txt
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(cubDeviceFind LANGUAGES C CXX CUDA)
+
+# Disable response file for libraries on QNX as qcc does not support lib paths with double quotes
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+    set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES OFF)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120) # GPU architectures the sample is compiled for
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Fetch CCCL via CPM at the tag given by the cache variable CCCL_SAMPLES_CCCL_TAG.
+# Override with -DCCCL_SOURCE_DIR=/path/to/cccl to use a local checkout instead of downloading.
+set(CCCL_SAMPLES_CCCL_TAG "v3.3.3" CACHE STRING
+    "Tag/branch of NVIDIA/cccl to fetch for the CCCL samples")
+
+if(NOT TARGET CCCL::CCCL) # nothing to add when the CCCL::CCCL target already exists
+    include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/CPM.cmake")
+    if(DEFINED CCCL_SOURCE_DIR AND NOT CCCL_SOURCE_DIR STREQUAL "")
+        CPMAddPackage(NAME CCCL SOURCE_DIR "${CCCL_SOURCE_DIR}")
+    else()
+        CPMAddPackage(
+            NAME CCCL
+            GIT_REPOSITORY "https://github.com/NVIDIA/cccl"
+            GIT_TAG "${CCCL_SAMPLES_CCCL_TAG}"
+        )
+    endif()
+endif()
+
+# Headers shared by the samples (the repository's Common directory)
+include_directories(../../../Common)
+
+# Executable target for cubDeviceFind,
+# built from the single source file cubDeviceFind.cu
+add_executable(cubDeviceFind cubDeviceFind.cu)
+
+target_compile_options(cubDeviceFind PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+target_compile_features(cubDeviceFind PRIVATE cxx_std_17 cuda_std_17)
+
+set_target_properties(cubDeviceFind PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+target_link_libraries(cubDeviceFind PRIVATE
+    CUDA::cudart
+    CCCL::CCCL
+)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/4_CUDA_Libraries/cubDeviceFind/README.md b/cpp/4_CUDA_Libraries/cubDeviceFind/README.md
new file mode 100644
index 00000000..3b3e6db4
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceFind/README.md
@@ -0,0 +1,48 @@
+# cubDeviceFind - CUB DeviceFind Search Algorithms
+
+## Description
+
+This sample demonstrates the three device-wide search algorithms: `cub::DeviceFind::FindIf` for predicate search, and `cub::DeviceFind::LowerBound` / `UpperBound` for parallel binary search. Results are verified against `std::find_if`, `std::lower_bound`, and `std::upper_bound` on the host.
+
+## Key Concepts
+
+CCCL 3.3, CUB Device Algorithms, Parallel Search, Binary Search
+
+## Supported SM Architectures
+
+[SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) [SM 10.0 ](https://developer.nvidia.com/cuda-gpus) [SM 11.0 ](https://developer.nvidia.com/cuda-gpus) [SM 12.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CCCL CUB](https://nvidia.github.io/cccl/unstable/cub/index.html)
+
+cub::DeviceFind::FindIf, cub::DeviceFind::LowerBound, cub::DeviceFind::UpperBound
+
+### [CCCL libcu++](https://nvidia.github.io/cccl/unstable/libcudacxx/index.html)
+
+cuda::std::less
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+
+cudaDeviceSynchronize, cudaGetDeviceProperties
+
+## Dependencies needed to build/run
+
+[CCCL 3.3+](https://github.com/NVIDIA/cccl). Fetched automatically via CPM at configure time (pinned to `v3.3.3`). Override with `-DCCCL_SOURCE_DIR=/path/to/cccl` to use a local checkout.
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## References (for more details)
+
+[CCCL 3.3 release notes](https://github.com/NVIDIA/cccl/releases), [cub::DeviceFind header](https://github.com/NVIDIA/cccl/blob/main/cub/cub/device/device_find.cuh)
diff --git a/cpp/4_CUDA_Libraries/cubDeviceFind/cubDeviceFind.cu b/cpp/4_CUDA_Libraries/cubDeviceFind/cubDeviceFind.cu
new file mode 100644
index 00000000..cf2c19e9
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceFind/cubDeviceFind.cu
@@ -0,0 +1,248 @@
+/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This sample demonstrates the three device-wide search algorithms
+ * introduced in CCCL 3.3: cub::DeviceFind::FindIf for predicate search,
+ * and cub::DeviceFind::LowerBound / UpperBound for parallel binary
+ * search. Results are verified against std::find_if, std::lower_bound,
+ * and std::upper_bound on the host.
+ */
+
+/* Includes, system */
+#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+/* Includes, cuda */
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+/* Includes, cccl */
+#include <cub/device/device_find.cuh>
+#include <cuda/std/functional>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+/* Unary predicate passed to cub::DeviceFind::FindIf: true for values strictly greater than `threshold`. */
+struct is_greater_than_t
+{
+    int threshold; /* exclusive lower limit; run_find_if() also reads it for the host reference search */
+    __host__ __device__ bool operator()(int value) const { return value > threshold; }
+};
+
+static bool run_find_if()
+{
+    /* Input: 0, 1, ..., 15.  Predicate: value > 9.  Expected index: 10.  Returns true if the device index equals std::find_if's. */
+    const int                  num_items = 16;
+    thrust::device_vector<int> d_in(num_items);
+    for (int i = 0; i < num_items; ++i)
+        d_in[i] = i;
+    thrust::device_vector<int> d_out(1); /* one slot: the index reported by FindIf */
+    is_greater_than_t          predicate{9};
+
+    size_t temp_bytes = 0; /* filled by the nullptr-storage call below, which only queries the scratch size */
+    checkCudaErrors(
+        cub::DeviceFind::FindIf(nullptr, temp_bytes, d_in.begin(), d_out.begin(), predicate, num_items));
+    thrust::device_vector<char> temp(temp_bytes); /* scratch buffer handed to the second, real call */
+    checkCudaErrors(cub::DeviceFind::FindIf(thrust::raw_pointer_cast(temp.data()),
+                                            temp_bytes,
+                                            d_in.begin(),
+                                            d_out.begin(),
+                                            predicate,
+                                            num_items));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const int got = d_out[0]; /* result element read back on the host */
+
+    thrust::host_vector<int> h_in       = d_in; /* host copy of the input for the reference search */
+    auto                     host_it    = std::find_if(h_in.begin(), h_in.end(),
+                                   [&](int v) { return v > predicate.threshold; });
+    const int                expected   = static_cast<int>(host_it - h_in.begin()); /* iterator -> index */
+
+    printf("cub::DeviceFind::FindIf(value > %d) over [0..%d)\n", predicate.threshold, num_items);
+    printf("  got index = %d, expected = %d  %s\n", got, expected, (got == expected ? "OK" : "FAIL"));
+    return got == expected;
+}
+
+static bool run_lower_bound()
+{
+    /* Sorted range: [0, 2, 4, 6, 8].  Values to locate: [1, 3, 5, 7].  Returns true if every device index equals std::lower_bound's. */
+    thrust::device_vector<int> d_range  = {0, 2, 4, 6, 8};
+    thrust::device_vector<int> d_values = {1, 3, 5, 7};
+    thrust::device_vector<int> d_out(d_values.size()); /* one result index per searched value */
+
+    size_t temp_bytes = 0; /* filled by the nullptr-storage call below, which only queries the scratch size */
+    checkCudaErrors(cub::DeviceFind::LowerBound(nullptr,
+                                                temp_bytes,
+                                                d_range.begin(),
+                                                static_cast<int>(d_range.size()),
+                                                d_values.begin(),
+                                                static_cast<int>(d_values.size()),
+                                                d_out.begin(),
+                                                cuda::std::less{}));
+    thrust::device_vector<char> temp(temp_bytes); /* scratch buffer handed to the second, real call */
+    checkCudaErrors(cub::DeviceFind::LowerBound(thrust::raw_pointer_cast(temp.data()),
+                                                temp_bytes,
+                                                d_range.begin(),
+                                                static_cast<int>(d_range.size()),
+                                                d_values.begin(),
+                                                static_cast<int>(d_values.size()),
+                                                d_out.begin(),
+                                                cuda::std::less{}));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    thrust::host_vector<int> h_range  = d_range;
+    thrust::host_vector<int> h_values = d_values;
+    thrust::host_vector<int> got      = d_out; /* device results copied to the host */
+    std::vector<int>         expected(h_values.size());
+    for (size_t i = 0; i < h_values.size(); ++i) { /* host reference: index of the lower bound of each value */
+        expected[i] = static_cast<int>(
+            std::lower_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
+    }
+
+    bool ok = true; /* cleared on the first mismatch; printing continues so the full result is shown */
+    printf("cub::DeviceFind::LowerBound\n");
+    printf("  range   = { 0, 2, 4, 6, 8 }\n"); /* literal text: keep in sync with d_range above */
+    printf("  values  = { 1, 3, 5, 7 }\n");    /* literal text: keep in sync with d_values above */
+    printf("  got     = {");
+    for (size_t i = 0; i < got.size(); ++i) {
+        printf(" %d", got[i]);
+        if (got[i] != expected[i])
+            ok = false;
+    }
+    printf(" }\n  expect  = {");
+    for (size_t i = 0; i < expected.size(); ++i)
+        printf(" %d", expected[i]);
+    printf(" }  %s\n", ok ? "OK" : "FAIL");
+    return ok;
+}
+
+static bool run_upper_bound()
+{
+    /* Runs LowerBound and UpperBound on the same data; returns true if both match the host reference.
+     * The range has duplicates so that the two differ on values that appear in the range. */
+    thrust::device_vector<int> d_range  = {0, 2, 2, 4, 6, 8};
+    thrust::device_vector<int> d_values = {2, 2};
+    thrust::device_vector<int> d_lb(d_values.size()); /* LowerBound results, one per value */
+    thrust::device_vector<int> d_ub(d_values.size()); /* UpperBound results, one per value */
+
+    size_t temp_bytes_lb = 0; /* filled by the nullptr-storage call below, which only queries the scratch size */
+    checkCudaErrors(cub::DeviceFind::LowerBound(nullptr,
+                                                temp_bytes_lb,
+                                                d_range.begin(),
+                                                static_cast<int>(d_range.size()),
+                                                d_values.begin(),
+                                                static_cast<int>(d_values.size()),
+                                                d_lb.begin(),
+                                                cuda::std::less{}));
+    thrust::device_vector<char> temp_lb(temp_bytes_lb); /* scratch buffer for the real LowerBound call */
+    checkCudaErrors(cub::DeviceFind::LowerBound(thrust::raw_pointer_cast(temp_lb.data()),
+                                                temp_bytes_lb,
+                                                d_range.begin(),
+                                                static_cast<int>(d_range.size()),
+                                                d_values.begin(),
+                                                static_cast<int>(d_values.size()),
+                                                d_lb.begin(),
+                                                cuda::std::less{}));
+
+    size_t temp_bytes_ub = 0; /* UpperBound gets its own size query and its own scratch buffer */
+    checkCudaErrors(cub::DeviceFind::UpperBound(nullptr,
+                                                temp_bytes_ub,
+                                                d_range.begin(),
+                                                static_cast<int>(d_range.size()),
+                                                d_values.begin(),
+                                                static_cast<int>(d_values.size()),
+                                                d_ub.begin(),
+                                                cuda::std::less{}));
+    thrust::device_vector<char> temp_ub(temp_bytes_ub);
+    checkCudaErrors(cub::DeviceFind::UpperBound(thrust::raw_pointer_cast(temp_ub.data()),
+                                                temp_bytes_ub,
+                                                d_range.begin(),
+                                                static_cast<int>(d_range.size()),
+                                                d_values.begin(),
+                                                static_cast<int>(d_values.size()),
+                                                d_ub.begin(),
+                                                cuda::std::less{}));
+    checkCudaErrors(cudaDeviceSynchronize()); /* a single synchronization follows both searches */
+
+    thrust::host_vector<int> h_range  = d_range;
+    thrust::host_vector<int> h_values = d_values;
+    thrust::host_vector<int> got_lb   = d_lb;
+    thrust::host_vector<int> got_ub   = d_ub;
+    std::vector<int>         exp_lb(h_values.size());
+    std::vector<int>         exp_ub(h_values.size());
+    for (size_t i = 0; i < h_values.size(); ++i) { /* host reference indices from the standard library */
+        exp_lb[i] =
+            static_cast<int>(std::lower_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
+        exp_ub[i] =
+            static_cast<int>(std::upper_bound(h_range.begin(), h_range.end(), h_values[i]) - h_range.begin());
+    }
+
+    bool ok = true; /* shared by both comparisons: any mismatch in lb or ub fails the demo */
+    printf("cub::DeviceFind::UpperBound (with duplicates in range)\n");
+    printf("  range   = { 0, 2, 2, 4, 6, 8 }\n"); /* literal text: keep in sync with d_range above */
+    printf("  values  = { 2, 2 }\n");             /* literal text: keep in sync with d_values above */
+    printf("  lb      = {");
+    for (size_t i = 0; i < got_lb.size(); ++i) {
+        printf(" %d", got_lb[i]);
+        if (got_lb[i] != exp_lb[i])
+            ok = false;
+    }
+    printf(" }  expected = {");
+    for (size_t i = 0; i < exp_lb.size(); ++i)
+        printf(" %d", exp_lb[i]);
+    printf(" }\n  ub      = {");
+    for (size_t i = 0; i < got_ub.size(); ++i) {
+        printf(" %d", got_ub[i]);
+        if (got_ub[i] != exp_ub[i])
+            ok = false;
+    }
+    printf(" }  expected = {");
+    for (size_t i = 0; i < exp_ub.size(); ++i)
+        printf(" %d", exp_ub[i]);
+    printf(" }  %s\n", ok ? "OK" : "FAIL");
+    return ok;
+}
+
+int main(int argc, char **argv)
+{
+    int devID = findCudaDevice(argc, (const char **)argv); /* helper_cuda.h utility; presumably honors a device chosen on the command line - confirm */
+    cudaDeviceProp props;
+    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);
+
+    bool ok = true; /* combined verdict of the three demos below */
+    ok &= run_find_if(); /* `&=` rather than `&&`: the later demos still run after a failure */
+    printf("\n");
+    ok &= run_lower_bound();
+    printf("\n");
+    ok &= run_upper_bound();
+
+    printf("\n%s\n", ok ? "Done" : "FAILED");
+    return ok ? EXIT_SUCCESS : EXIT_FAILURE; /* process exit status mirrors the verification result */
+}
diff --git a/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/CMakeLists.txt b/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/CMakeLists.txt
new file mode 100644
index 00000000..60a402c1
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/CMakeLists.txt
@@ -0,0 +1,63 @@
+cmake_minimum_required(VERSION 3.20)
+# Build script for the cubDeviceSegmentedScan sample: one CUDA executable linked against CCCL.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(cubDeviceSegmentedScan LANGUAGES C CXX CUDA)
+
+# Disable response file for libraries on QNX as qcc does not support lib paths with double quotes
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+    set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES OFF)
+endif()
+
+find_package(CUDAToolkit REQUIRED) # provides the CUDA::cudart target linked below
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120) # compute capabilities to generate device code for
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Fetch CCCL via CPM unless a CCCL::CCCL target already exists in this build.
+# Override with -DCCCL_SOURCE_DIR=/path/to/cccl to use a local checkout
+# instead of fetching from GitHub.
+set(CCCL_SAMPLES_CCCL_TAG "v3.3.3" CACHE STRING
+    "Tag/branch of NVIDIA/cccl to fetch for the CCCL samples") # cache entry: change with -DCCCL_SAMPLES_CCCL_TAG=<ref>
+
+if(NOT TARGET CCCL::CCCL) # nothing to do when the target is already defined
+    include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/CPM.cmake") # repository copy of CPM; supplies CPMAddPackage()
+    if(DEFINED CCCL_SOURCE_DIR AND NOT CCCL_SOURCE_DIR STREQUAL "")
+        CPMAddPackage(NAME CCCL SOURCE_DIR "${CCCL_SOURCE_DIR}") # local checkout supplied by the user
+    else()
+        CPMAddPackage(
+            NAME CCCL
+            GIT_REPOSITORY "https://github.com/NVIDIA/cccl"
+            GIT_TAG "${CCCL_SAMPLES_CCCL_TAG}"
+        )
+    endif()
+endif()
+
+# Shared sample headers directory (presumably where helper_cuda.h, included by the source, lives)
+include_directories(../../../Common)
+
+# Source file
+# Add target for cubDeviceSegmentedScan
+add_executable(cubDeviceSegmentedScan cubDeviceSegmentedScan.cu)
+# --extended-lambda: the source defines a __host__ __device__ lambda (the running-max operator)
+target_compile_options(cubDeviceSegmentedScan PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+target_compile_features(cubDeviceSegmentedScan PRIVATE cxx_std_17 cuda_std_17) # require at least C++17 for host and CUDA code
+
+set_target_properties(cubDeviceSegmentedScan PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # separate compilation of device code
+
+target_link_libraries(cubDeviceSegmentedScan PRIVATE
+    CUDA::cudart
+    CCCL::CCCL
+)
+
+# Include installation configuration (setup_samples_install() is expected to be defined by InstallSamples.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/README.md b/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/README.md
new file mode 100644
index 00000000..59c8811b
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/README.md
@@ -0,0 +1,48 @@
+# cubDeviceSegmentedScan - CUB DeviceSegmentedScan
+
+## Description
+
+This sample demonstrates `cub::DeviceSegmentedScan`. A segmented scan computes an independent scan over each of many contiguous segments in a single device-wide call. Two operations are shown: `ExclusiveSegmentedSum` across three independent segments, and `InclusiveSegmentedScan` with a custom binary operator (running maximum via `cuda::maximum<>`).
+
+## Key Concepts
+
+CUB Device Algorithms, Segmented Scan, Prefix Sum
+
+## Supported SM Architectures
+
+[SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) [SM 10.0 ](https://developer.nvidia.com/cuda-gpus) [SM 11.0 ](https://developer.nvidia.com/cuda-gpus) [SM 12.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CCCL CUB](https://nvidia.github.io/cccl/cub/)
+
+cub::DeviceSegmentedScan::ExclusiveSegmentedSum, cub::DeviceSegmentedScan::InclusiveSegmentedScan
+
+### [CCCL libcu++](https://nvidia.github.io/cccl/libcudacxx/)
+
+cuda::maximum
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+
+cudaDeviceSynchronize, cudaGetDeviceProperties
+
+## Dependencies needed to build/run
+
+[CCCL 3.3+](https://github.com/NVIDIA/cccl). Fetched automatically via CPM at configure time (pinned to `v3.3.3`). Override with `-DCCCL_SOURCE_DIR=/path/to/cccl` to use a local checkout.
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## References (for more details)
+
+[CCCL 3.3 release notes](https://github.com/NVIDIA/cccl/releases), [cub::DeviceSegmentedScan header](https://github.com/NVIDIA/cccl/blob/main/cub/cub/device/device_segmented_scan.cuh)
diff --git a/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/cubDeviceSegmentedScan.cu b/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/cubDeviceSegmentedScan.cu
new file mode 100644
index 00000000..653d678b
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceSegmentedScan/cubDeviceSegmentedScan.cu
@@ -0,0 +1,188 @@
+/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This sample demonstrates cub::DeviceSegmentedScan, added in CCCL 3.3.
+ * Two operations are shown: ExclusiveSegmentedSum across three independent
+ * segments, and InclusiveSegmentedScan with a custom binary operator
+ * (running maximum via cuda::maximum<>). Each is verified against a
+ * host reference implementation.
+ */
+
+/* Includes, system */
+#include <algorithm>
+#include <limits>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+/* Includes, cuda */
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+/* Includes, cccl */
+#include <cub/device/device_segmented_scan.cuh>
+#include <cuda/functional>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+template <typename T>
+static void print_vec(const char *label, const std::vector<T> &v)
+{
+    printf("  %-24s{", label); /* one output line: label padded to 24 columns, then "{ e0 e1 ... }" */
+    for (const T &item : v)
+        printf(" %d", static_cast<int>(item)); /* every element is printed as an int */
+    printf(" }\n");
+}
+
+static std::vector<int> host_exclusive_segmented_sum(const std::vector<int> &input, const std::vector<size_t> &offsets)
+{
+    /* Host reference: exclusive prefix sum restarted for every segment [offsets[k - 1], offsets[k]).
+     * Positions outside all segments keep their initial 0. */
+    std::vector<int> result(input.size(), 0);
+    for (size_t seg = 1; seg < offsets.size(); ++seg) {
+        const size_t first = offsets[seg - 1];
+        for (size_t pos = first; pos < offsets[seg]; ++pos)
+            result[pos] = (pos == first) ? 0 : result[pos - 1] + input[pos - 1];
+    }
+    return result;
+}
+
+static std::vector<int> host_inclusive_segmented_max(const std::vector<int> &input, const std::vector<size_t> &offsets)
+{
+    /* Host reference: running maximum restarted for every segment [offsets[k - 1], offsets[k]). */
+    std::vector<int> result(input.size(), 0);
+    for (size_t seg = 1; seg < offsets.size(); ++seg) {
+        /* Start from the smallest int so the first element of a segment always becomes the maximum. */
+        int peak = std::numeric_limits<int>::min();
+        for (size_t pos = offsets[seg - 1]; pos < offsets[seg]; ++pos)
+            result[pos] = peak = std::max(peak, input[pos]);
+    }
+    return result;
+}
+
+static bool run_exclusive_segmented_sum()
+{
+    /* 3 segments: [1,2,3] [4,5] [6,7,8] -> [0,1,3] [0,4] [0,6,13].  Returns true if the device output equals the host reference. */
+    thrust::device_vector<int>    d_in      = {1, 2, 3, 4, 5, 6, 7, 8};
+    thrust::device_vector<size_t> d_offsets = {0, 3, 5, 8}; /* 4 boundaries delimit the 3 segments */
+    thrust::device_vector<int>    d_out(d_in.size());
+
+    const auto num_segments  = d_offsets.size() - 1;
+    auto       begin_offsets = d_offsets.begin();     /* start of segment s is d_offsets[s] */
+    auto       end_offsets   = d_offsets.begin() + 1; /* same array shifted by one: end of segment s is d_offsets[s + 1] */
+
+    size_t temp_bytes = 0; /* filled by the nullptr-storage call below, which only queries the scratch size */
+    checkCudaErrors(cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
+        nullptr, temp_bytes, d_in.begin(), d_out.begin(), begin_offsets, end_offsets, num_segments));
+    thrust::device_vector<char> temp(temp_bytes); /* scratch buffer handed to the second, real call */
+    checkCudaErrors(cub::DeviceSegmentedScan::ExclusiveSegmentedSum(thrust::raw_pointer_cast(temp.data()),
+                                                                    temp_bytes,
+                                                                    d_in.begin(),
+                                                                    d_out.begin(),
+                                                                    begin_offsets,
+                                                                    end_offsets,
+                                                                    num_segments));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    std::vector<int>    h_in(d_in.begin(), d_in.end()); /* host copies used for printing and for the reference */
+    std::vector<size_t> h_off(d_offsets.begin(), d_offsets.end());
+    std::vector<int>    got(d_out.begin(), d_out.end());
+    std::vector<int>    expected = host_exclusive_segmented_sum(h_in, h_off);
+
+    printf("cub::DeviceSegmentedScan::ExclusiveSegmentedSum\n");
+    print_vec("input:", h_in);
+    printf("  %-24s{", "offsets:"); /* printed by hand: print_vec formats with %d, the offsets are size_t */
+    for (auto o : h_off)
+        printf(" %zu", o);
+    printf(" }\n");
+    print_vec("got:", got);
+    print_vec("expected:", expected);
+    const bool ok = got == expected; /* element-wise std::vector comparison */
+    printf("  %s\n", ok ? "OK" : "FAIL");
+    return ok;
+}
+
+static bool run_inclusive_segmented_max()
+{
+    /* Same three segments, but compute running max per segment.  Returns true if the device output equals the host reference. */
+    thrust::device_vector<int>    d_in      = {3, 1, 4, 5, 2, 9, 7, 8};
+    thrust::device_vector<size_t> d_offsets = {0, 3, 5, 8}; /* 4 boundaries delimit the 3 segments */
+    thrust::device_vector<int>    d_out(d_in.size());
+
+    const auto num_segments  = d_offsets.size() - 1;
+    auto       begin_offsets = d_offsets.begin();     /* start of segment s is d_offsets[s] */
+    auto       end_offsets   = d_offsets.begin() + 1; /* same array shifted by one: end of segment s is d_offsets[s + 1] */
+    /* Needs nvcc --extended-lambda. NOTE(review): CUDA docs list an MSVC-only rule that the enclosing function have external linkage; this one is static - confirm on Windows. */
+    auto max_op = [] __host__ __device__(int a, int b) -> int { return cuda::maximum<>{}(a, b); };
+
+    size_t temp_bytes = 0; /* filled by the nullptr-storage call below, which only queries the scratch size */
+    checkCudaErrors(cub::DeviceSegmentedScan::InclusiveSegmentedScan(
+        nullptr, temp_bytes, d_in.begin(), d_out.begin(), begin_offsets, end_offsets, num_segments, max_op));
+    thrust::device_vector<char> temp(temp_bytes); /* scratch buffer handed to the second, real call */
+    checkCudaErrors(cub::DeviceSegmentedScan::InclusiveSegmentedScan(thrust::raw_pointer_cast(temp.data()),
+                                                                     temp_bytes,
+                                                                     d_in.begin(),
+                                                                     d_out.begin(),
+                                                                     begin_offsets,
+                                                                     end_offsets,
+                                                                     num_segments,
+                                                                     max_op));
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    std::vector<int>    h_in(d_in.begin(), d_in.end()); /* host copies used for printing and for the reference */
+    std::vector<size_t> h_off(d_offsets.begin(), d_offsets.end());
+    std::vector<int>    got(d_out.begin(), d_out.end());
+    std::vector<int>    expected = host_inclusive_segmented_max(h_in, h_off);
+
+    printf("cub::DeviceSegmentedScan::InclusiveSegmentedScan (running max)\n");
+    print_vec("input:", h_in);
+    printf("  %-24s{", "offsets:"); /* printed by hand: print_vec formats with %d, the offsets are size_t */
+    for (auto o : h_off)
+        printf(" %zu", o);
+    printf(" }\n");
+    print_vec("got:", got);
+    print_vec("expected:", expected);
+    const bool ok = got == expected; /* element-wise std::vector comparison */
+    printf("  %s\n", ok ? "OK" : "FAIL");
+    return ok;
+}
+
+int main(int argc, char **argv)
+{
+    int devID = findCudaDevice(argc, (const char **)argv); /* helper_cuda.h utility; presumably honors a device chosen on the command line - confirm */
+    cudaDeviceProp props;
+    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);
+
+    bool ok = true; /* combined verdict of the two demos below */
+    ok &= run_exclusive_segmented_sum(); /* `&=` rather than `&&`: the second demo still runs after a failure */
+    printf("\n");
+    ok &= run_inclusive_segmented_max();
+
+    printf("\n%s\n", ok ? "Done" : "FAILED");
+    return ok ? EXIT_SUCCESS : EXIT_FAILURE; /* process exit status mirrors the verification result */
+}
diff --git a/cpp/4_CUDA_Libraries/cubDeviceTransform/CMakeLists.txt b/cpp/4_CUDA_Libraries/cubDeviceTransform/CMakeLists.txt
new file mode 100644
index 00000000..ecacbd10
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceTransform/CMakeLists.txt
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 3.20)
+# Build script for the cubDeviceTransform sample: one CUDA executable linked against CCCL.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(cubDeviceTransform LANGUAGES C CXX CUDA)
+
+# Disable response file for libraries on QNX as qcc does not support lib paths with double quotes
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+    set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES OFF)
+endif()
+
+find_package(CUDAToolkit REQUIRED) # provides the CUDA::cudart target linked below
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120) # compute capabilities to generate device code for
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Fetch CCCL via CPM unless a CCCL::CCCL target already exists in this build.
+# Override with -DCCCL_SOURCE_DIR=/path/to/cccl to use a local checkout instead of fetching from GitHub.
+set(CCCL_SAMPLES_CCCL_TAG "v3.3.3" CACHE STRING
+    "Tag/branch of NVIDIA/cccl to fetch for the CCCL samples") # cache entry: change with -DCCCL_SAMPLES_CCCL_TAG=<ref>
+
+if(NOT TARGET CCCL::CCCL) # nothing to do when the target is already defined
+    include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/CPM.cmake") # repository copy of CPM; supplies CPMAddPackage()
+    if(DEFINED CCCL_SOURCE_DIR AND NOT CCCL_SOURCE_DIR STREQUAL "")
+        CPMAddPackage(NAME CCCL SOURCE_DIR "${CCCL_SOURCE_DIR}") # local checkout supplied by the user
+    else()
+        CPMAddPackage(
+            NAME CCCL
+            GIT_REPOSITORY "https://github.com/NVIDIA/cccl"
+            GIT_TAG "${CCCL_SAMPLES_CCCL_TAG}"
+        )
+    endif()
+endif()
+
+# Shared sample headers directory (presumably where the helper headers used by the source live)
+include_directories(../../../Common)
+
+# Source file
+# Add target for cubDeviceTransform
+add_executable(cubDeviceTransform cubDeviceTransform.cu)
+# --extended-lambda lets the CUDA source use __host__ __device__ lambdas
+target_compile_options(cubDeviceTransform PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+target_compile_features(cubDeviceTransform PRIVATE cxx_std_17 cuda_std_17) # require at least C++17 for host and CUDA code
+
+set_target_properties(cubDeviceTransform PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # separate compilation of device code
+
+target_link_libraries(cubDeviceTransform PRIVATE
+    CUDA::cudart
+    CCCL::CCCL
+)
+
+# Include installation configuration (setup_samples_install() is expected to be defined by InstallSamples.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/4_CUDA_Libraries/cubDeviceTransform/README.md b/cpp/4_CUDA_Libraries/cubDeviceTransform/README.md
new file mode 100644
index 00000000..084fff78
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceTransform/README.md
@@ -0,0 +1,48 @@
+# cubDeviceTransform - CUB DeviceTransform N-to-M
+
+## Description
+
+This sample demonstrates `cub::DeviceTransform` in its N-input / M-output form. A single device-wide call reads from N input sequences and writes to M output sequences, driven by a user-provided op that returns a `cuda::std::tuple` of M values. Two cases are shown: N=3 inputs producing 1 output, and N=2 inputs producing 2 outputs (sum and difference in one fused pass).
+
+## Key Concepts
+
+CCCL 3.3, CUB Device Algorithms, Fused Elementwise Transforms, Counting Iterators
+
+## Supported SM Architectures
+
+[SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) [SM 10.0 ](https://developer.nvidia.com/cuda-gpus) [SM 11.0 ](https://developer.nvidia.com/cuda-gpus) [SM 12.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CCCL CUB](https://nvidia.github.io/cccl/cub/)
+
+cub::DeviceTransform::Transform
+
+### [CCCL libcu++](https://nvidia.github.io/cccl/libcudacxx/)
+
+cuda::counting_iterator, cuda::std::tuple
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+
+cudaDeviceSynchronize, cudaGetDeviceProperties
+
+## Dependencies needed to build/run
+
+[CCCL 3.3+](https://github.com/NVIDIA/cccl). Fetched automatically via CPM at configure time (pinned to `v3.3.3`). Override with `-DCCCL_SOURCE_DIR=/path/to/cccl` to use a local checkout.
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## References (for more details)
+
+[CCCL 3.3 release notes](https://github.com/NVIDIA/cccl/releases), [cub::DeviceTransform header](https://github.com/NVIDIA/cccl/blob/main/cub/cub/device/device_transform.cuh)
diff --git a/cpp/4_CUDA_Libraries/cubDeviceTransform/cubDeviceTransform.cu b/cpp/4_CUDA_Libraries/cubDeviceTransform/cubDeviceTransform.cu
new file mode 100644
index 00000000..9ab569f0
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/cubDeviceTransform/cubDeviceTransform.cu
@@ -0,0 +1,155 @@
+/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This sample demonstrates cub::DeviceTransform in its N-input/M-output
+ * form (extended in CCCL 3.3). A single device-wide call reads from
+ * N input sequences and writes to M output sequences, driven by a
+ * user-provided op that returns a tuple of M values. Two cases are
+ * shown: N=3 -> 1 and N=2 -> 2. Results are verified against a host
+ * reference.
+ */
+
+/* Includes, system */
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+/* Includes, cuda */
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+/* Includes, cccl */
+#include <cub/device/device_transform.cuh>
+#include <cuda/iterator>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+static bool run_n_to_one_transform()
+{
+    /* result[i] = (a[i] + b[i]) * c[i], with c = counting_iterator<int>(100). Returns true if verified. */
+    thrust::device_vector<int>   a        = {0, -2, 5, 3};
+    thrust::device_vector<float> b        = {5.2f, 3.1f, -1.1f, 3.0f};
+    auto                         counting = cuda::counting_iterator<int>{100};
+    thrust::device_vector<int>   result(a.size());
+
+    auto op = [] __host__ __device__(int x, float y, int z) -> int {
+        return static_cast<int>((x + y) * z); /* int + float is evaluated in float, then converted to int */
+    };
+    /* One call over all three inputs; the counting iterator supplies c without a device buffer. */
+    checkCudaErrors(cub::DeviceTransform::Transform(
+        cuda::std::tuple{a.begin(), b.begin(), counting}, result.begin(), a.size(), op));
+    checkCudaErrors(cudaDeviceSynchronize());
+    /* Host reference: copy inputs and result back, then recompute with the same expression. */
+    thrust::host_vector<int>   ha  = a;
+    thrust::host_vector<float> hb  = b;
+    thrust::host_vector<int>   got = result;
+    std::vector<int>           expected(a.size());
+    for (size_t i = 0; i < a.size(); ++i) {
+        expected[i] = static_cast<int>((ha[i] + hb[i]) * static_cast<int>(100 + i)); /* 100 + i mirrors counting[i] */
+    }
+    /* Print both sequences; a single differing element fails the case. */
+    bool ok = true;
+    printf("cub::DeviceTransform::Transform (N=3 inputs -> 1 output)\n");
+    printf("  result = (a + b) * c with c = counting_iterator(100)\n");
+    printf("  got      = {");
+    for (size_t i = 0; i < got.size(); ++i) {
+        printf(" %d", got[i]);
+        if (got[i] != expected[i])
+            ok = false;
+    }
+    printf(" }\n  expected = {");
+    for (size_t i = 0; i < expected.size(); ++i)
+        printf(" %d", expected[i]);
+    printf(" }  %s\n", ok ? "OK" : "FAIL");
+    return ok;
+}
+
+static bool run_n_to_m_transform()
+{
+    /* (sum[i], diff[i]) = (a[i] + b[i], a[i] - b[i]) in one pass. Returns true if verified. */
+    thrust::device_vector<int> a = {1, 5, 10, 7, 3};
+    thrust::device_vector<int> b = {4, 2, 8, 1, 9};
+    thrust::device_vector<int> sum(a.size());
+    thrust::device_vector<int> diff(a.size());
+
+    auto op = [] __host__ __device__(int x, int y) -> cuda::std::tuple<int, int> {
+        return {x + y, x - y}; /* first element is checked against sum, second against diff */
+    };
+    /* Inputs and outputs are both passed as tuples of iterators. */
+    checkCudaErrors(cub::DeviceTransform::Transform(cuda::std::tuple{a.begin(), b.begin()},
+                                                    cuda::std::tuple{sum.begin(), diff.begin()},
+                                                    a.size(),
+                                                    op));
+    checkCudaErrors(cudaDeviceSynchronize());
+    /* Host reference for both outputs, computed from copies of the inputs. */
+    thrust::host_vector<int> ha = a, hb = b, got_sum = sum, got_diff = diff;
+    std::vector<int>         exp_sum(a.size()), exp_diff(a.size());
+    for (size_t i = 0; i < a.size(); ++i) {
+        exp_sum[i]  = ha[i] + hb[i];
+        exp_diff[i] = ha[i] - hb[i];
+    }
+    /* Print and compare sum, then diff; ok accumulates across both outputs. */
+    bool ok = true;
+    printf("cub::DeviceTransform::Transform (N=2 inputs -> M=2 outputs)\n");
+    printf("  op returns cuda::std::tuple{a + b, a - b}\n");
+    printf("  sum  = {");
+    for (size_t i = 0; i < got_sum.size(); ++i) {
+        printf(" %d", got_sum[i]);
+        if (got_sum[i] != exp_sum[i])
+            ok = false;
+    }
+    printf(" }  expected = {");
+    for (auto v : exp_sum)
+        printf(" %d", v);
+    printf(" }\n  diff = {");
+    for (size_t i = 0; i < got_diff.size(); ++i) {
+        printf(" %d", got_diff[i]);
+        if (got_diff[i] != exp_diff[i])
+            ok = false;
+    }
+    printf(" }  expected = {");
+    for (auto v : exp_diff)
+        printf(" %d", v);
+    printf(" }  %s\n", ok ? "OK" : "FAIL");
+    return ok;
+}
+
+int main(int argc, char **argv)
+{
+    int devID = findCudaDevice(argc, (const char **)argv);
+    cudaDeviceProp props;
+    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);
+    /* Run both cases; &= (not &&) so the second case still runs and prints after a failure in the first. */
+    bool ok = true;
+    ok &= run_n_to_one_transform();
+    printf("\n");
+    ok &= run_n_to_m_transform();
+    /* The exit status reports whether every case verified. */
+    printf("\n%s\n", ok ? "Done" : "FAILED");
+    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/cpp/4_CUDA_Libraries/libcuxxMdspan/CMakeLists.txt b/cpp/4_CUDA_Libraries/libcuxxMdspan/CMakeLists.txt
new file mode 100644
index 00000000..5f66f5f1
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/libcuxxMdspan/CMakeLists.txt
@@ -0,0 +1,92 @@
+cmake_minimum_required(VERSION 3.20)
+# Build script for the libcuxxMdspan sample: one executable, with CCCL and DLPack fetched via CPM below.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(libcuxxMdspan LANGUAGES C CXX CUDA)
+
+# Disable response file for libraries on QNX as qcc does not support lib paths with double quotes
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+    set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES OFF)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Fetch CCCL (CUB + libcu++ + Thrust) via CPM. The toolkit that ships
+# with CUDA 13.2 bundles CCCL 3.2, but this sample uses APIs added in
+# CCCL 3.3. Pinning the tag here lets the sample build on any toolkit
+# with a usable nvcc. Override with -DCCCL_SOURCE_DIR=/path/to/cccl
+# to use a local checkout instead of fetching from GitHub.
+set(CCCL_SAMPLES_CCCL_TAG "v3.3.3" CACHE STRING
+    "Tag/branch of NVIDIA/cccl to fetch for the CCCL samples")
+
+if(NOT TARGET CCCL::CCCL) # skip the fetch when an enclosing build already defines CCCL::CCCL
+    include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/CPM.cmake")
+    if(DEFINED CCCL_SOURCE_DIR AND NOT CCCL_SOURCE_DIR STREQUAL "")
+        CPMAddPackage(NAME CCCL SOURCE_DIR "${CCCL_SOURCE_DIR}")
+    else()
+        CPMAddPackage(
+            NAME CCCL
+            GIT_REPOSITORY "https://github.com/NVIDIA/cccl"
+            GIT_TAG "${CCCL_SAMPLES_CCCL_TAG}"
+        )
+    endif()
+endif()
+
+# DLPack headers are required for cuda::to_device_mdspan and
+# cuda::to_dlpack_tensor. Fetch via CPM (override with DLPACK_SOURCE_DIR).
+set(CCCL_SAMPLES_DLPACK_TAG "v1.3" CACHE STRING
+    "Tag of dmlc/dlpack to fetch for libcuxxMdspan")
+
+if(NOT TARGET dlpack::dlpack)
+    if(NOT COMMAND CPMAddPackage) # CPM.cmake was not included above if CCCL::CCCL already existed
+        include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/CPM.cmake")
+    endif()
+    if(DEFINED DLPACK_SOURCE_DIR AND NOT DLPACK_SOURCE_DIR STREQUAL "")
+        CPMAddPackage(NAME dlpack SOURCE_DIR "${DLPACK_SOURCE_DIR}" DOWNLOAD_ONLY YES)
+    else()
+        CPMAddPackage(
+            NAME dlpack
+            GIT_REPOSITORY "https://github.com/dmlc/dlpack"
+            GIT_TAG "${CCCL_SAMPLES_DLPACK_TAG}"
+            DOWNLOAD_ONLY YES
+        )
+    endif()
+    if(dlpack_ADDED) # DOWNLOAD_ONLY fetches sources only, so the header-only target is defined here
+        add_library(dlpack_headers INTERFACE)
+        target_include_directories(dlpack_headers INTERFACE "${dlpack_SOURCE_DIR}/include")
+        add_library(dlpack::dlpack ALIAS dlpack_headers)
+    endif()
+endif()
+
+# Include directories and libraries
+include_directories(../../../Common)
+
+# Source file
+# Add target for libcuxxMdspan
+add_executable(libcuxxMdspan libcuxxMdspan.cu)
+
+target_compile_options(libcuxxMdspan PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+target_compile_features(libcuxxMdspan PRIVATE cxx_std_17 cuda_std_17)
+
+set_target_properties(libcuxxMdspan PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+target_link_libraries(libcuxxMdspan PRIVATE
+    CUDA::cudart
+    CCCL::CCCL
+    dlpack::dlpack
+)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/4_CUDA_Libraries/libcuxxMdspan/README.md b/cpp/4_CUDA_Libraries/libcuxxMdspan/README.md
new file mode 100644
index 00000000..178ec839
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/libcuxxMdspan/README.md
@@ -0,0 +1,44 @@
+# libcuxxMdspan - libcu++ mdspan Interop (DLPack + shared_memory_mdspan)
+
+## Description
+
+This sample demonstrates two mdspan-centric features from CCCL: DLPack <-> `cuda::std::mdspan` bridging via `cuda::to_device_mdspan` / `cuda::to_dlpack_tensor` (the tensor-interchange protocol used by PyTorch, JAX, CuPy, and others), and `cuda::shared_memory_mdspan` for multi-dimensional views of shared-memory tiles with address-space-safe accessors. A small matrix is built, wrapped in a DLTensor, converted to a `device_mdspan`, scaled row-wise, and transposed through a `shared_memory_mdspan` tile. The output mdspan is converted back to DLPack and its metadata is printed.
+
+## Key Concepts
+
+CCCL 3.3, libcu++ mdspan, DLPack Interoperability, Shared Memory Views
+
+## Supported SM Architectures
+
+[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) [SM 10.0 ](https://developer.nvidia.com/cuda-gpus) [SM 11.0 ](https://developer.nvidia.com/cuda-gpus) [SM 12.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CCCL libcu++](https://nvidia.github.io/cccl/libcudacxx/)
+
+cuda::to_device_mdspan, cuda::to_dlpack_tensor, cuda::device_mdspan, cuda::shared_memory_mdspan, cuda::std::mdspan
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+
+cudaMalloc, cudaFree, cudaMemcpy, cudaMemset, cudaDeviceSynchronize, cudaGetDeviceProperties
+
+## Dependencies needed to build/run
+
+[CCCL 3.3+](https://github.com/NVIDIA/cccl), [DLPack 1.2+](https://github.com/dmlc/dlpack). Both fetched automatically via CPM at configure time (pinned to `v3.3.3` and `v1.3` respectively). Override with `-DCCCL_SOURCE_DIR=/path/to/cccl` and `-DDLPACK_SOURCE_DIR=/path/to/dlpack` to use local checkouts.
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## References (for more details)
+
+[CCCL 3.3 release notes](https://github.com/NVIDIA/cccl/releases), [cuda::to_device_mdspan header](https://github.com/NVIDIA/cccl/blob/main/libcudacxx/include/cuda/__mdspan/dlpack_to_mdspan.h), [cuda::shared_memory_mdspan docs](https://nvidia.github.io/cccl/libcudacxx/extended_api/mdspan/shared_memory_accessor.html), [DLPack specification](https://dmlc.github.io/dlpack/latest/)
diff --git a/cpp/4_CUDA_Libraries/libcuxxMdspan/libcuxxMdspan.cu b/cpp/4_CUDA_Libraries/libcuxxMdspan/libcuxxMdspan.cu
new file mode 100644
index 00000000..90016faa
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/libcuxxMdspan/libcuxxMdspan.cu
@@ -0,0 +1,246 @@
+/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This sample demonstrates two mdspan-centric features from CCCL 3.3:
+ *
+ *   1. DLPack <-> cuda::std::mdspan bridging through
+ *      cuda::to_device_mdspan<T, Rank>(DLTensor)  ->  cuda::device_mdspan
+ *      cuda::to_dlpack_tensor(device_mdspan)      ->  DLManagedTensor
+ *      The DLPack format is the interchange protocol used by PyTorch,
+ *      JAX, CuPy, and other frameworks; cuda::device_mdspan is the
+ *      device-side view with rich shape/stride metadata for kernels.
+ *
+ *   2. cuda::shared_memory_mdspan: a multi-dimensional view over a
+ *      shared-memory tile. The accessor guarantees shared-memory
+ *      load/store instructions and adds address-space safety checks.
+ *
+ * A sample matrix is built on the device, wrapped in a DLTensor,
+ * converted to a cuda::device_mdspan, and two kernels run against it:
+ * scale_rows_kernel multiplies row i by (i + 1), and
+ * shared_tile_transpose_kernel uses a cuda::shared_memory_mdspan to
+ * transpose a block-sized tile through shared memory. The output
+ * mdspan is then converted back to DLPack metadata and printed.
+ */
+
+/* Includes, system */
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+/* Includes, cuda */
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+/* Includes, cccl */
+#include <cuda/mdspan>
+#include <cuda/std/array>
+#include <cuda/std/cstdint>
+#include <cuda/std/mdspan>
+
+#define ROWS 8
+#define COLS 8
+#define TILE 8 /* matches ROWS / COLS for simplicity */
+
+using extents2d = cuda::std::dextents<cuda::std::size_t, 2>;
+
+/* Kernel A: multiply row i of a 2-D device_mdspan by (i + 1), one thread per
+ * element (y indexes rows, x indexes columns).  Templated on the mdspan type so
+ * it accepts the exact type produced by cuda::to_device_mdspan (which uses
+ * layout_stride_relaxed and int64_t extents). */
+template <typename Tensor>
+__global__ void scale_rows_kernel(Tensor tensor)
+{
+    const int r = blockIdx.y * blockDim.y + threadIdx.y; /* global row */
+    const int c = blockIdx.x * blockDim.x + threadIdx.x; /* global column */
+    if (r < static_cast<int>(tensor.extent(0)) && c < static_cast<int>(tensor.extent(1))) { /* grid may overhang */
+        tensor(r, c) *= static_cast<float>(r + 1);
+    }
+}
+
+/* Kernel B: block-tile transpose driven by a shared_memory_mdspan.
+ * Each TILE x TILE thread block loads one tile of the input into shared
+ * memory through a cuda::shared_memory_mdspan, then writes it back
+ * transposed; threadIdx indexes the tile directly. */
+template <typename InTensor, typename OutTensor>
+__global__ void shared_tile_transpose_kernel(InTensor in, OutTensor out)
+{
+    __shared__ float smem_storage[TILE * TILE];
+    cuda::shared_memory_mdspan smem(smem_storage, cuda::std::dextents<cuda::std::size_t, 2>{TILE, TILE});
+
+    const int tr = threadIdx.y; /* row inside the tile */
+    const int tc = threadIdx.x; /* column inside the tile */
+    const int r  = blockIdx.y * TILE + tr;
+    const int c  = blockIdx.x * TILE + tc;
+
+    if (r < static_cast<int>(in.extent(0)) && c < static_cast<int>(in.extent(1))) {
+        smem(tr, tc) = in(r, c);
+    }
+    __syncthreads(); /* every load into the tile must finish before the transposed reads below */
+    /* Swapped block indices: input tile (blockIdx.y, blockIdx.x) lands at output tile (blockIdx.x, blockIdx.y). */
+    const int r_out = blockIdx.x * TILE + tr;
+    const int c_out = blockIdx.y * TILE + tc;
+    if (r_out < static_cast<int>(out.extent(0)) && c_out < static_cast<int>(out.extent(1))) {
+        out(r_out, c_out) = smem(tc, tr); /* element-level transpose within the tile */
+    }
+}
+
+struct DLTensorStorage /* a DLTensor plus the shape/stride arrays that tensor.shape and tensor.strides point into */
+{
+    ::DLTensor                              tensor{}; /* value-initialized, so byte_offset stays 0 */
+    cuda::std::array<cuda::std::int64_t, 2> shape{};
+    cuda::std::array<cuda::std::int64_t, 2> strides{};
+    DLTensorStorage(float *device_ptr, int rows, int cols, int device_ordinal) : shape{rows, cols}, strides{cols, 1}
+    {
+        tensor.data    = device_ptr;
+        tensor.device  = ::DLDevice{::kDLCUDA, device_ordinal};
+        tensor.ndim    = 2;
+        tensor.dtype   = ::DLDataType{::DLDataTypeCode::kDLFloat, 32, 1};
+        tensor.shape   = shape.data(); /* self-pointers: valid only for this exact object */
+        tensor.strides = strides.data();
+    }
+    DLTensorStorage(const DLTensorStorage &) = delete; /* a copy's tensor would still point into the source */
+};
+/* Row-major rows x cols view of device_ptr. Returned as a prvalue: C++17 builds it in the caller, no copy. */
+static DLTensorStorage make_row_major_dltensor(float *device_ptr, int rows, int cols, int device_ordinal)
+{
+    return DLTensorStorage(device_ptr, rows, cols, device_ordinal);
+}
+
+int main(int argc, char **argv)
+{
+    int devID = findCudaDevice(argc, (const char **)argv);
+    cudaDeviceProp props;
+    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);
+    /* Device buffers for the ROWS x COLS input and output matrices. */
+    float       *d_in  = nullptr;
+    float       *d_out = nullptr;
+    const size_t nelem = static_cast<size_t>(ROWS) * COLS;
+    checkCudaErrors(cudaMalloc(&d_in, nelem * sizeof(float)));
+    checkCudaErrors(cudaMalloc(&d_out, nelem * sizeof(float)));
+    /* Input element (r, c) holds its own row-major index; the output starts zeroed. */
+    std::vector<float> host(nelem);
+    for (int r = 0; r < ROWS; ++r) {
+        for (int c = 0; c < COLS; ++c) {
+            host[r * COLS + c] = static_cast<float>(r * COLS + c);
+        }
+    }
+    checkCudaErrors(cudaMemcpy(d_in, host.data(), nelem * sizeof(float), cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemset(d_out, 0, nelem * sizeof(float)));
+    /* Describe both buffers as DLPack tensors, then view them as device mdspans. */
+    DLTensorStorage in_dl  = make_row_major_dltensor(d_in, ROWS, COLS, devID);
+    DLTensorStorage out_dl = make_row_major_dltensor(d_out, ROWS, COLS, devID);
+
+    auto in_md  = cuda::to_device_mdspan<float, 2>(in_dl.tensor);
+    auto out_md = cuda::to_device_mdspan<float, 2>(out_dl.tensor);
+    /* to_device_mdspan uses int64_t extents, so print them as long long instead of with %zu. */
+    printf("cuda::to_device_mdspan produced a 2-D device_mdspan of shape (%lld, %lld)\n\n",
+           static_cast<long long>(in_md.extent(0)),
+           static_cast<long long>(in_md.extent(1)));
+    /* Kernel A: scale row i by (i + 1) in place through the DLPack-derived mdspan. */
+    dim3 block(8, 8);
+    dim3 grid((COLS + block.x - 1) / block.x, (ROWS + block.y - 1) / block.y);
+    scale_rows_kernel<<<grid, block>>>(in_md);
+    checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaDeviceSynchronize());
+    /* Verify against the closed form (r * COLS + c) * (r + 1); stop at the first mismatch. */
+    std::vector<float> scaled(nelem);
+    checkCudaErrors(cudaMemcpy(scaled.data(), d_in, nelem * sizeof(float), cudaMemcpyDeviceToHost));
+    bool scale_ok = true;
+    for (int r = 0; r < ROWS && scale_ok; ++r) {
+        for (int c = 0; c < COLS && scale_ok; ++c) {
+            const float expect = static_cast<float>((r * COLS + c) * (r + 1));
+            if (scaled[r * COLS + c] != expect) {
+                printf("scale_rows mismatch at (%d,%d): got %g expected %g\n",
+                       r,
+                       c,
+                       scaled[r * COLS + c],
+                       expect);
+                scale_ok = false;
+            }
+        }
+    }
+    if (scale_ok) {
+        printf("scale_rows kernel: OK (row i scaled by i+1 via cuda::device_mdspan)\n");
+    }
+    /* Kernel B: transpose the scaled input into d_out, using mdspans built directly from the raw pointers. */
+    cuda::device_mdspan<const float, extents2d> in_md_const(d_in, extents2d{ROWS, COLS});
+    cuda::device_mdspan<float, extents2d>       out_md_rw(d_out, extents2d{ROWS, COLS});
+
+    dim3 tile_block(TILE, TILE);
+    dim3 tile_grid((COLS + TILE - 1) / TILE, (ROWS + TILE - 1) / TILE);
+    shared_tile_transpose_kernel<<<tile_grid, tile_block>>>(in_md_const, out_md_rw);
+    checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaDeviceSynchronize());
+    /* The reference is the transpose of the scaled matrix already copied to the host. */
+    std::vector<float> transposed(nelem);
+    checkCudaErrors(cudaMemcpy(transposed.data(), d_out, nelem * sizeof(float), cudaMemcpyDeviceToHost));
+    bool tp_ok = true;
+    for (int r = 0; r < ROWS && tp_ok; ++r) {
+        for (int c = 0; c < COLS && tp_ok; ++c) {
+            const float expect = scaled[c * COLS + r];
+            if (transposed[r * COLS + c] != expect) {
+                printf("transpose mismatch at (%d,%d): got %g expected %g\n",
+                       r,
+                       c,
+                       transposed[r * COLS + c],
+                       expect);
+                tp_ok = false;
+            }
+        }
+    }
+    if (tp_ok) {
+        printf("shared_tile_transpose kernel: OK (tile transpose via cuda::shared_memory_mdspan)\n");
+    }
+    /* Round trip: export the output mdspan back to DLPack and print its metadata. */
+    auto        dl_wrapper = cuda::to_dlpack_tensor(out_md);
+    const auto &dltensor   = dl_wrapper.get();
+    printf("\ncuda::to_dlpack_tensor metadata:\n");
+    printf("  device       : kDLCUDA (ordinal %d)\n", dltensor.device.device_id);
+    printf("  ndim         : %d\n", dltensor.ndim);
+    printf("  dtype        : code=%u bits=%u lanes=%u\n",
+           static_cast<unsigned>(dltensor.dtype.code),
+           static_cast<unsigned>(dltensor.dtype.bits),
+           static_cast<unsigned>(dltensor.dtype.lanes));
+    printf("  shape        : [%lld, %lld]\n",
+           static_cast<long long>(dltensor.shape[0]),
+           static_cast<long long>(dltensor.shape[1]));
+    if (dltensor.strides != nullptr) { /* strides are printed only when the exported tensor carries them */
+        printf("  strides      : [%lld, %lld]\n",
+               static_cast<long long>(dltensor.strides[0]),
+               static_cast<long long>(dltensor.strides[1]));
+    }
+
+    checkCudaErrors(cudaFree(d_in));
+    checkCudaErrors(cudaFree(d_out));
+
+    if (!scale_ok || !tp_ok) {
+        return EXIT_FAILURE;
+    }
+    printf("\nDone\n");
+    return EXIT_SUCCESS;
+}
diff --git a/cpp/4_CUDA_Libraries/libcuxxRandom/CMakeLists.txt b/cpp/4_CUDA_Libraries/libcuxxRandom/CMakeLists.txt
new file mode 100644
index 00000000..1164c560
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/libcuxxRandom/CMakeLists.txt
@@ -0,0 +1,65 @@
+cmake_minimum_required(VERSION 3.20)
+# Build script for the libcuxxRandom sample: one executable, with CCCL fetched via CPM below.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(libcuxxRandom LANGUAGES C CXX CUDA)
+
+# Disable response file for libraries on QNX as qcc does not support lib paths with double quotes
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+    set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES OFF)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 89 90 100 110 120)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Fetch CCCL (CUB + libcu++ + Thrust) via CPM. The toolkit that ships
+# with CUDA 13.2 bundles CCCL 3.2, but this sample uses APIs added in
+# CCCL 3.3. Pinning the tag here lets the sample build on any toolkit
+# with a usable nvcc. Override with -DCCCL_SOURCE_DIR=/path/to/cccl
+# to use a local checkout instead of fetching from GitHub.
+set(CCCL_SAMPLES_CCCL_TAG "v3.3.3" CACHE STRING
+    "Tag/branch of NVIDIA/cccl to fetch for the CCCL samples")
+
+if(NOT TARGET CCCL::CCCL) # skip the fetch when an enclosing build already defines CCCL::CCCL
+    include("${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/CPM.cmake")
+    if(DEFINED CCCL_SOURCE_DIR AND NOT CCCL_SOURCE_DIR STREQUAL "") # a local checkout takes precedence over the tag
+        CPMAddPackage(NAME CCCL SOURCE_DIR "${CCCL_SOURCE_DIR}")
+    else()
+        CPMAddPackage(
+            NAME CCCL
+            GIT_REPOSITORY "https://github.com/NVIDIA/cccl"
+            GIT_TAG "${CCCL_SAMPLES_CCCL_TAG}"
+        )
+    endif()
+endif()
+
+# Include directories and libraries
+include_directories(../../../Common)
+
+# Source file
+# Add target for libcuxxRandom
+add_executable(libcuxxRandom libcuxxRandom.cu)
+
+target_compile_options(libcuxxRandom PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+target_compile_features(libcuxxRandom PRIVATE cxx_std_17 cuda_std_17)
+
+set_target_properties(libcuxxRandom PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+target_link_libraries(libcuxxRandom PRIVATE
+    CUDA::cudart
+    CCCL::CCCL
+)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/4_CUDA_Libraries/libcuxxRandom/README.md b/cpp/4_CUDA_Libraries/libcuxxRandom/README.md
new file mode 100644
index 00000000..adb6b816
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/libcuxxRandom/README.md
@@ -0,0 +1,44 @@
+# libcuxxRandom - libcu++ Random Distributions
+
+## Description
+
+This sample demonstrates the random-number facilities added to libcu++ in CCCL. `<cuda/std/random>` now offers host- and device-compatible implementations of the standard C++ distributions (uniform, normal, Poisson, Bernoulli, and more) and backports the C++26 `cuda::std::philox4x32` / `philox4x64` engines. `<cuda/random>` adds `cuda::pcg64` as an NVIDIA extension (the same generator NumPy uses by default). A kernel draws samples on each thread and the host computes empirical statistics, comparing them to the theoretical mean / variance / probability.
+
+## Key Concepts
+
+CCCL 3.3, libcu++ Random, PCG, Philox, Device-Side PRNG
+
+## Supported SM Architectures
+
+[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) [SM 10.0 ](https://developer.nvidia.com/cuda-gpus) [SM 11.0 ](https://developer.nvidia.com/cuda-gpus) [SM 12.0 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows
+
+## Supported CPU Architecture
+
+x86_64, aarch64
+
+## CUDA APIs involved
+
+### [CCCL libcu++](https://nvidia.github.io/cccl/libcudacxx/)
+
+cuda::pcg64, cuda::std::philox4x32, cuda::std::uniform_real_distribution, cuda::std::normal_distribution, cuda::std::poisson_distribution, cuda::std::bernoulli_distribution
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+
+cudaMalloc, cudaFree, cudaMemcpy, cudaDeviceSynchronize, cudaGetDeviceProperties
+
+## Dependencies needed to build/run
+
+[CCCL 3.3+](https://github.com/NVIDIA/cccl). Fetched automatically via CPM at configure time (pinned to `v3.3.3`). Override with `-DCCCL_SOURCE_DIR=/path/to/cccl` to use a local checkout.
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## References (for more details)
+
+[CCCL 3.3 release notes](https://github.com/NVIDIA/cccl/releases), [cuda::pcg64 header](https://github.com/NVIDIA/cccl/blob/main/libcudacxx/include/cuda/__random/pcg_engine.h), [NumPy PCG64](https://numpy.org/doc/stable/reference/random/bit_generators/pcg64.html)
diff --git a/cpp/4_CUDA_Libraries/libcuxxRandom/libcuxxRandom.cu b/cpp/4_CUDA_Libraries/libcuxxRandom/libcuxxRandom.cu
new file mode 100644
index 00000000..db9400f2
--- /dev/null
+++ b/cpp/4_CUDA_Libraries/libcuxxRandom/libcuxxRandom.cu
@@ -0,0 +1,180 @@
+/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This sample demonstrates the random-number facilities added to libcu++
+ * in CCCL 3.3: <cuda/std/random> now offers host- and device-compatible
+ * implementations of the standard C++ distributions (uniform, normal,
+ * Poisson, Bernoulli, ...), and backports the C++26 Philox counter-based
+ * engines. <cuda/random> adds cuda::pcg64 as an NVIDIA extension (the
+ * same generator NumPy uses by default).
+ *
+ * A kernel draws many samples from four different distributions on each
+ * thread and the host computes empirical summary statistics, comparing
+ * them to the theoretical mean / variance / probability.
+ */
+
+/* Includes, system */
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+
+/* Includes, cuda */
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+/* Includes, cccl */
+#include <cuda/random>
+#include <cuda/std/random>
+
+#define THREADS_PER_BLOCK  256
+#define SAMPLES_PER_THREAD 256
+
+/* Per-thread kernel: seed a per-thread PCG engine and draw uniform, normal
+ * and Poisson samples from it; Bernoulli samples are drawn through a Philox
+ * engine instead, to show that distributions work with any engine. */
+__global__ void sample_kernel(unsigned long long base_seed,
+                              int                num_samples_per_thread,
+                              float             *uniform_out,
+                              float             *normal_out,
+                              int               *poisson_out,
+                              int               *bernoulli_out)
+{
+    const int tid           = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total_threads = gridDim.x * blockDim.x;
+
+    cuda::pcg64 rng(base_seed + static_cast<unsigned long long>(tid));
+
+    cuda::std::uniform_real_distribution<float> uniform_dist(0.0f, 1.0f);
+    cuda::std::normal_distribution<float>       normal_dist(0.0f, 1.0f);
+    cuda::std::poisson_distribution<int>        poisson_dist(4.0);
+    cuda::std::bernoulli_distribution           bernoulli_dist(0.25);
+
+    cuda::std::philox4x32 philox(static_cast<cuda::std::uint32_t>(base_seed + 17u + tid));
+    /* Thread-interleaved layout; idx is size_t because i * total_threads can overflow int. */
+    for (int i = 0; i < num_samples_per_thread; ++i) {
+        const size_t idx   = static_cast<size_t>(i) * total_threads + tid;
+        uniform_out[idx]   = uniform_dist(rng);
+        normal_out[idx]    = normal_dist(rng);
+        poisson_out[idx]   = poisson_dist(rng);
+        bernoulli_out[idx] = bernoulli_dist(philox) ? 1 : 0;
+    }
+}
+
+/* Print the sample mean and the unbiased (n - 1) sample variance of `samples`
+ * next to the expected values, on one line prefixed by `label`. */
+template <typename T>
+static void summarize(const std::vector<T> &samples, double expected_mean, double expected_var, const char *label)
+{
+    const size_t count = samples.size();
+    /* First pass: arithmetic mean, accumulated in double. */
+    double total = 0.0;
+    for (size_t k = 0; k < count; ++k)
+        total += static_cast<double>(samples[k]);
+    const double sample_mean = total / static_cast<double>(count);
+
+    /* Second pass: sum of squared deviations from that mean. */
+    double dev_sq = 0.0;
+    for (size_t k = 0; k < count; ++k) {
+        const double delta = static_cast<double>(samples[k]) - sample_mean;
+        dev_sq += delta * delta;
+    }
+    const double sample_var = dev_sq / static_cast<double>(count - 1);
+    printf("%-24s n=%zu  mean=%.4f (exp %.4f)   var=%.4f (exp %.4f)\n",
+           label, count, sample_mean, expected_mean, sample_var, expected_var);
+}
+
+/* Print the mean of `samples` (the fraction of 1s for 0/1 data) next to expected_p. */
+static void summarize_bernoulli(const std::vector<int> &samples, double expected_p)
+{
+    long long hits = 0;
+    for (size_t k = 0; k < samples.size(); ++k) hits += samples[k];
+    const double observed_p = static_cast<double>(hits) / static_cast<double>(samples.size());
+    printf("%-24s n=%zu  p(1)=%.4f (exp %.4f)\n", "bernoulli(0.25):", samples.size(), observed_p, expected_p);
+}
+
+int main(int argc, char **argv)
+{   /* Draws samples on the GPU, then prints empirical vs. expected statistics on the host. */
+    int num_blocks = 64; /* default grid size; "--blocks N" overrides it */
+    for (int i = 1; i + 1 < argc; ++i) { /* stops one short so argv[i + 1] always exists */
+        if (strcmp(argv[i], "--blocks") == 0)
+            num_blocks = atoi(argv[i + 1]);
+    }
+    if (num_blocks <= 0) /* clamp non-positive values (atoi gives 0 for non-numeric text) */
+        num_blocks = 1;
+
+    int devID = findCudaDevice(argc, (const char **)argv);
+    cudaDeviceProp props;
+    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
+    printf("Device: %s (Compute Capability %d.%d)\n\n", props.name, props.major, props.minor);
+
+    const int    total_threads = num_blocks * THREADS_PER_BLOCK;
+    const size_t n             = static_cast<size_t>(total_threads) * SAMPLES_PER_THREAD; /* one slot per (thread, sample) */
+    printf("Drawing %zu samples per distribution (%d blocks x %d threads x %d samples/thread)\n\n",
+           n,
+           num_blocks,
+           THREADS_PER_BLOCK,
+           SAMPLES_PER_THREAD);
+    /* One device output buffer of n elements per distribution. */
+    float *d_uniform   = nullptr;
+    float *d_normal    = nullptr;
+    int   *d_poisson   = nullptr;
+    int   *d_bernoulli = nullptr;
+    checkCudaErrors(cudaMalloc(&d_uniform, n * sizeof(float)));
+    checkCudaErrors(cudaMalloc(&d_normal, n * sizeof(float)));
+    checkCudaErrors(cudaMalloc(&d_poisson, n * sizeof(int)));
+    checkCudaErrors(cudaMalloc(&d_bernoulli, n * sizeof(int)));
+    /* Fixed base seed; sample_kernel adds the thread index to it for each engine. */
+    const unsigned long long seed = 0xC0FFEE00ULL;
+    sample_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
+        seed, SAMPLES_PER_THREAD, d_uniform, d_normal, d_poisson, d_bernoulli);
+    checkCudaErrors(cudaGetLastError());      /* reports kernel launch errors */
+    checkCudaErrors(cudaDeviceSynchronize()); /* waits for the kernel; reports execution errors */
+    /* Copy all four sample arrays back to host memory. */
+    std::vector<float> uniform(n), normal(n);
+    std::vector<int>   poisson(n), bernoulli(n);
+    checkCudaErrors(cudaMemcpy(uniform.data(), d_uniform, n * sizeof(float), cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(normal.data(), d_normal, n * sizeof(float), cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(poisson.data(), d_poisson, n * sizeof(int), cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(bernoulli.data(), d_bernoulli, n * sizeof(int), cudaMemcpyDeviceToHost));
+    /* Expected values match the distribution parameters used in sample_kernel. */
+    summarize(uniform, /*mean=*/0.5, /*var=*/1.0 / 12.0, "uniform(0,1):");
+    summarize(normal, /*mean=*/0.0, /*var=*/1.0, "normal(0,1):");
+    summarize(poisson, /*mean=*/4.0, /*var=*/4.0, "poisson(lambda=4):");
+    summarize_bernoulli(bernoulli, /*p=*/0.25);
+
+    printf("\nEngines exercised: cuda::pcg64 (NumPy-compatible) and cuda::std::philox4x32 (C++26)\n");
+
+    checkCudaErrors(cudaFree(d_uniform));
+    checkCudaErrors(cudaFree(d_normal));
+    checkCudaErrors(cudaFree(d_poisson));
+    checkCudaErrors(cudaFree(d_bernoulli));
+
+    printf("Done\n");
+    return EXIT_SUCCESS;
+}
diff --git a/cpp/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp b/cpp/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp
index e3279ea2..a10063b1 100644
--- a/cpp/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp
+++ b/cpp/8_Platform_Specific/Tegra/fluidsGLES/fluidsGLES.cpp
@@ -461,8 +461,6 @@ void autoTest(char **argv)
 // Run fluids Simulation
 bool runFluidsSimulation(int argc, char **argv, char *ref_file)
 {
-    // Create the CUTIL timer
-    sdkCreateTimer(&timer);
 
 
     if (ref_file != NULL) {
diff --git a/cpp/9_CUDA_Tile/Benchmark_Common/benchmark.h b/cpp/9_CUDA_Tile/Benchmark_Common/benchmark.h
new file mode 100644
index 00000000..d9bb25f3
--- /dev/null
+++ b/cpp/9_CUDA_Tile/Benchmark_Common/benchmark.h
@@ -0,0 +1,206 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file provides common benchmark utilities for CUDA Tile C++
+ * microbenchmarks.
+ */
+
+#ifndef CUDA_TILE_BENCHMARK_H
+#define CUDA_TILE_BENCHMARK_H
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Benchmark configuration: settings read by the timing and reporting helpers in this
+// header; the defaults below can be overridden on the command line via parse_benchmark_args().
+struct BenchmarkConfig {
+    bool use_validation = false;   // --validate: enable CPU cross-validation and [OK]/[FAIL] output
+    int warmup_iters = 5;          // --warmup=N or --skip-warmup (sets 0): untimed launches before timing
+    int bench_iters = 20;          // --iters=N or -i N: timed launches that are averaged
+};
+
+inline BenchmarkConfig& bench_config() {  // returns the one shared, mutable configuration object
+    static BenchmarkConfig config;  // function-local static: created on first call with the defaults
+    return config;
+}
+
+// convenience accessors: each returns the current value of one bench_config() field
+inline bool use_validation() { return bench_config().use_validation; }  // CPU cross-validation enabled?
+inline int warmup_iters() { return bench_config().warmup_iters; }  // untimed launches before timing
+inline int bench_iters() { return bench_config().bench_iters; }  // timed launches per measurement
+
+// Parse command line options into bench_config(). Call from main() before benchmarks.
+// Exits on --help, on an unknown option, and on a non-positive iteration count.
+inline void parse_benchmark_args(int argc, char** argv) {
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--validate") == 0) {
+            bench_config().use_validation = true;
+        } else if (strcmp(argv[i], "--skip-warmup") == 0) {
+            bench_config().warmup_iters = 0;
+        } else if (strncmp(argv[i], "--warmup=", 9) == 0) {
+            bench_config().warmup_iters = atoi(argv[i] + 9);
+        } else if (strncmp(argv[i], "--iters=", 8) == 0) {
+            bench_config().bench_iters = atoi(argv[i] + 8);
+        } else if (strcmp(argv[i], "-i") == 0) {
+            if (i + 1 >= argc) {  // "-i N": the value is the next argument
+                fprintf(stderr, "Error: -i requires an argument\n");
+                exit(EXIT_FAILURE);
+            }
+            bench_config().bench_iters = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
+            printf("Benchmark options:\n");
+            printf("  --validate             Enable CPU cross-validation\n");
+            printf("  --skip-warmup          Disable warmup iterations\n");
+            printf("  --warmup=N             Warmup iterations (default: 5)\n");
+            printf("  -i N, --iters=N        Benchmark iterations (default: 20)\n");
+            exit(0);
+        } else if (argv[i][0] == '-') {
+            fprintf(stderr, "Error: unknown option '%s'\n", argv[i]);
+            fprintf(stderr, "Try '--help' for usage information.\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+    // atoi() gives 0 for non-numeric text; time_kernel() divides by bench_iters, so require > 0.
+    if (bench_config().bench_iters <= 0) {
+        fprintf(stderr, "Error: benchmark iterations must be a positive integer\n");
+        exit(EXIT_FAILURE);
+    }
+    // print active configuration
+    if (!bench_config().use_validation) printf("Note: CPU cross-validation disabled\n");
+    if (bench_config().warmup_iters == 0) {
+        printf("Note: warmup disabled, iters=%d\n", bench_config().bench_iters);
+    } else if (bench_config().warmup_iters != 5 || bench_config().bench_iters != 20) {
+        printf("Note: warmup=%d, iters=%d\n", bench_config().warmup_iters, bench_config().bench_iters);
+    }
+}
+
+// CUDA error checking: runs `call` once; on failure prints file:line plus the error string and exits
+#define CHECK_CUDA(call) do { \
+    cudaError_t err = call; \
+    if (err != cudaSuccess) { \
+        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
+                cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while(0)
+
+// Print the name of device 0 and, when its memory clock can be queried,
+// a theoretical peak memory bandwidth estimate.
+inline void print_device_info() {
+    cudaDeviceProp device_props;
+    CHECK_CUDA(cudaGetDeviceProperties(&device_props, 0));
+    printf("Device: %s\n", device_props.name);
+
+    int clock_khz;
+    if (cudaDeviceGetAttribute(&clock_khz, cudaDevAttrMemoryClockRate, 0) != cudaSuccess) return;
+    // 2.0 (presumably double data rate) * clock in kHz * bus width in bytes / 1e6 -> GB/s
+    const int bus_bytes = device_props.memoryBusWidth / 8;
+    printf("Memory Bandwidth: %.0f GB/s (theoretical peak)\n", 2.0 * clock_khz * bus_bytes / 1e6);
+}
+
+// Timing utility: owns a start/stop cudaEvent_t pair and reports the time between them.
+class CudaTimer {
+public:
+    CudaTimer() {
+        CHECK_CUDA(cudaEventCreate(&start_));
+        CHECK_CUDA(cudaEventCreate(&stop_));
+    }
+    CudaTimer(const CudaTimer&) = delete;             // non-copyable: copies would double-destroy the events
+    CudaTimer& operator=(const CudaTimer&) = delete;  // (same reason)
+
+    ~CudaTimer() {
+        cudaEventDestroy(start_);
+        cudaEventDestroy(stop_);
+    }
+
+    void start() { CHECK_CUDA(cudaEventRecord(start_)); }  // no stream argument: default stream
+
+    void stop() {  // record the stop event, then block the host until it has completed
+        CHECK_CUDA(cudaEventRecord(stop_));
+        CHECK_CUDA(cudaEventSynchronize(stop_));
+    }
+
+    float elapsed_ms() const {  // milliseconds between the two recorded events; call after stop()
+        float ms;
+        CHECK_CUDA(cudaEventElapsedTime(&ms, start_, stop_));
+        return ms;
+    }
+
+private:
+    cudaEvent_t start_, stop_;
+};
+
+// Launch `kernel_launch` bench_iters() times between two CUDA events and return
+// the mean milliseconds per launch; warmup_iters() untimed launches run first.
+template<typename KernelFunc>
+inline double time_kernel(KernelFunc kernel_launch) {
+    // untimed warmup phase, followed by a device-wide sync before timing starts
+    if (warmup_iters() > 0) {
+        int launched = 0;
+        while (launched < warmup_iters()) {
+            kernel_launch();
+            ++launched;
+        }
+        CHECK_CUDA(cudaDeviceSynchronize());
+    }
+
+    // timed phase: every launch is issued between one start/stop event pair
+    CudaTimer stopwatch;
+    stopwatch.start();
+    for (int launched = 0; launched < bench_iters(); ++launched) kernel_launch();
+    stopwatch.stop();
+    const float total_ms = stopwatch.elapsed_ms();
+    return total_ms / bench_iters();
+}
+
+// benchmark result structure: one output row, as printed by print_result()
+struct BenchmarkResult {
+    const char* name;       // label printed in the first column (the pointer is stored, not copied)
+    double time_ms;         // time in milliseconds
+    double bandwidth_gb_s;  // bandwidth in GB/s
+    double gflops;          // GFLOPS; print_result() omits the column unless this is > 0
+    bool correct;           // validation outcome; printed as [OK]/[FAIL] only when validation is on
+    
+    BenchmarkResult() : name(nullptr), time_ms(0), bandwidth_gb_s(0), gflops(0), correct(false) {}
+};
+
+// Print one result row: name, time and bandwidth, GFLOPS only when positive, and an
+// [OK]/[FAIL] tag only when validation is enabled.
+inline void print_result(const BenchmarkResult& r) {
+    const char* tag = !use_validation() ? "" : r.correct ? "[OK]" : "[FAIL]";
+    if (!(r.gflops > 0)) {
+        printf("  %-42s: %7.3f ms, %7.1f GB/s %s\n", r.name, r.time_ms, r.bandwidth_gb_s, tag);
+        return;
+    }
+    printf("  %-42s: %7.3f ms, %7.1f GB/s, %6.1f GFLOPS %s\n",
+           r.name, r.time_ms, r.bandwidth_gb_s, r.gflops, tag);
+}
+
+#endif // CUDA_TILE_BENCHMARK_H
diff --git a/cpp/9_CUDA_Tile/Benchmark_Common/matmul_benchmark.h b/cpp/9_CUDA_Tile/Benchmark_Common/matmul_benchmark.h
new file mode 100644
index 00000000..44bf959d
--- /dev/null
+++ b/cpp/9_CUDA_Tile/Benchmark_Common/matmul_benchmark.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file provides matrix multiplication benchmark helpers shared by
+ * tileMatmul and tileMatmulAutotuner.
+ */
+
+#ifndef CUDA_TILE_MATMUL_BENCHMARK_H
+#define CUDA_TILE_MATMUL_BENCHMARK_H
+
+#include "benchmark.h"
+
+#include <cuda_fp16.h>
+#include <cmath>
+
+// Derive result.gflops and result.bandwidth_gb_s for an (MxK)*(KxN) product from result.time_ms.
+inline void fill_matmul_metrics(BenchmarkResult& result, int M, int N, int K) {
+    // Traffic model: A (M*K) and B (K*N) are read as __half, C (M*N) is written as float.
+    const size_t input_elems = static_cast<size_t>(M) * K + static_cast<size_t>(K) * N;
+    const size_t output_elems = static_cast<size_t>(M) * N;
+    const size_t total_bytes = input_elems * sizeof(__half) + output_elems * sizeof(float);
+    result.bandwidth_gb_s = (total_bytes / 1e9) / (result.time_ms / 1000.0);
+    // Work model: one multiply and one add per (i, j, k) triple, i.e. 2 * M * N * K FLOPs.
+    result.gflops = (2.0 * M * N * K) / (result.time_ms * 1e6);
+}
+
+// CPU reference (FP16 inputs, FP32 accumulation): C[MxN] = A[MxK] * B[KxN], all row-major.
+inline void matmul_cpu(float* C, const __half* A, const __half* B, int M, int N, int K) {
+    for (int row = 0; row < M; row++) {
+        const __half* a_row = A + row * K;
+        float* c_row = C + row * N;
+        for (int col = 0; col < N; col++) {
+            float acc = 0.0f;
+            for (int k = 0; k < K; k++) acc += __half2float(a_row[k]) * __half2float(B[k * N + col]);
+            c_row[col] = acc;
+        }
+    }
+}
+
+// Compare M * N floats. An element fails when it exceeds BOTH the absolute (1e-2) and relative
+// (10%) tolerance, or when its error is NaN. Prints the first failure and returns false.
+inline bool verify_matmul_result(const char* name, const float* h_result,
+                                 const float* h_expected, int M, int N) {
+    for (int i = 0; i < M * N; i++) {
+        float abs_err = std::abs(h_result[i] - h_expected[i]);
+        float rel_err = abs_err / (std::abs(h_expected[i]) + 1e-6f);
+        // FP16 has less precision, so allow a larger tolerance. Written as !(within tolerance)
+        // so that a NaN error, for which every comparison is false, is reported as a failure.
+        if (!(abs_err <= 1e-2f || rel_err <= 0.1f)) {
+            printf("%s verification failed at %d: got %f, expected %f\n",
+                   name, i, h_result[i], h_expected[i]);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Time `kernel_launch`, derive the matmul metrics for an M x N x K problem, and call
+// `validate_result` only when validation is enabled (otherwise `correct` is set to true).
+template<typename KernelFunc, typename ValidateFunc>
+inline BenchmarkResult run_benchmark(const char* name, KernelFunc kernel_launch,
+                                     ValidateFunc validate_result, int M, int N, int K) {
+    BenchmarkResult outcome;
+    outcome.name = name;
+    outcome.time_ms = time_kernel(kernel_launch);
+    fill_matmul_metrics(outcome, M, N, K);
+    outcome.correct = use_validation() ? static_cast<bool>(validate_result()) : true;
+    return outcome;
+}
+
+#endif // CUDA_TILE_MATMUL_BENCHMARK_H
diff --git a/cpp/9_CUDA_Tile/CMakeLists.txt b/cpp/9_CUDA_Tile/CMakeLists.txt
new file mode 100644
index 00000000..99487d60
--- /dev/null
+++ b/cpp/9_CUDA_Tile/CMakeLists.txt
@@ -0,0 +1,17 @@
+# GCC auto-enables _FORTIFY_SOURCE at -O1 and above, which routes printf through
+# a __host__ __device__ wrapper that tile kernels can't call. Turn it off for
+# the tile category.
+add_compile_options(
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-U_FORTIFY_SOURCE>    # first drop any existing definition
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_FORTIFY_SOURCE=0>  # then define it as 0 explicitly
+)
+# The options above are directory-scoped, so they reach every sample added below.
+add_subdirectory(helloTile)
+add_subdirectory(tileVectorAdd)
+add_subdirectory(tileTranspose)
+add_subdirectory(tileMatmulAutotuner)
+add_subdirectory(tileMatmul)
+add_subdirectory(tileBmm)
+add_subdirectory(tileLayerNorm)
+add_subdirectory(tileRope)
+add_subdirectory(tileSpMV)
diff --git a/cpp/9_CUDA_Tile/README.md b/cpp/9_CUDA_Tile/README.md
new file mode 100644
index 00000000..7869389c
--- /dev/null
+++ b/cpp/9_CUDA_Tile/README.md
@@ -0,0 +1,74 @@
+# 9. CUDA Tile
+
+### [helloTile](./helloTile)
+
+A CUDA Tile C++ sample demonstrating basic usage of tile kernels. This sample shows how to launch a tile kernel and how data can be passed between SIMT and Tile kernels through global device memory.
+
+### [tileVectorAdd](./tileVectorAdd)
+
+This sample demonstrates a simple vector addition using CUDA Tile C++.
+The vector addition is performed by splitting the dataset into blocks
+which process 1024 elements at a time. The cuda::tiles::partition_view
+type is used to partition the data into chunks of size 1024. Each
+block loads its respective chunk from 'a' and 'b', performs an
+elementwise addition, then stores it to the corresponding chunk of
+'c'. Masked loads and stores are used to ensure that the last chunk
+which is partially out of bounds is correctly handled.
+
+### [tileTranspose](./tileTranspose)
+
+This sample demonstrates how to transpose a 2D matrix using CUDA Tile
+C++. Each block handles an n x m sized chunk of the source matrix. The
+block loads a chunk, transposes it locally, and stores it to the
+correct position in the result matrix. A cuda::tiles::partition_view
+is used to model the chunking of the source and result matrices.
+
+### [tileMatmul](./tileMatmul)
+
+This sample demonstrates how to write a matrix multiplication kernel with good performance in CUDA Tile C++. The kernel multiplies FP16 input tiles with FP32 accumulation using cuda::tiles::mma. The sample compares a naive implementation with an optimized implementation that applies good practices and provides the compiler with additional guidance for better code generation. The host code validates both results and uses CUDA events to compare execution time.
+
+### [tileMatmulAutotuner](./tileMatmulAutotuner)
+
+A CUDA Tile C++ sample demonstrating an nvrtc/nvcc autotuner over a matrix multiplication kernel. This sample shows how autotuning can help guide the choice of tile sizes and optimization hints.
+
+### [tileBmm](./tileBmm)
+
+This sample demonstrates a static-persistent batched matrix multiplication
+(BMM) using CUDA Tile C++. Given inputs A of shape (Q, M, K) and B of
+shape (Q, K, N), the kernel computes C = A x B of shape (Q, M, N). The
+grid launches a fixed number of persistent blocks sized from the device's
+SM count, and each block walks the (M, N, Q-chunk) tile space via a
+grid-stride loop. Each iteration consumes a chunk of BLOCK_SIZE_Q batches
+and issues a single rank-3 batched cuda::tiles::mma per K-step, with the
+(M, N) output partitioned into BLOCK_SIZE_M x BLOCK_SIZE_N tiles via
+cuda::tiles::partition_view.
+
+### [tileLayerNorm](./tileLayerNorm)
+
+This sample demonstrates a persistent layer-norm forward pass using
+CUDA Tile C++: `y = (x - mean) * rsqrt(var + eps) * weight + bias`.
+The grid launches `NUM_SMS` persistent blocks; each block walks the
+row dimension with a grid-stride loop, processing `BLOCK_N` rows by
+`BLOCK_D` cols per iteration. Per-row mean and inverse standard
+deviation are reduced across the column dimension with `cuda::tiles`
+row reductions, while the weight and bias tiles are loaded once and
+broadcast across rows. Compile-time template parameters for `N`,
+`D`, `NUM_SMS`, and `EPS` let the tile compiler fold the loop step,
+the `(1/D)` reciprocal, partition_view extents, and the eps
+broadcast.
+
+### [tileRope](./tileRope)
+
+This sample demonstrates a Rotary Position Embedding (RoPE) forward
+pass using CUDA Tile C++. The implementation uses the split-half
+(GPT-NeoX style) convention: for each token at position `s` the pair
+`(q[i], q[i + D/2])` is rotated by `theta = s * 10000^(-2i / D)`. The
+`cuda::tiles::partition_view` type partitions the Q and K tensors
+over (heads, half_rope_dim), and a single block processes all heads
+for one (batch, position) token in parallel, writing the result back
+in place against precomputed cos/sin tables.
+
+### [tileSpMV](./tileSpMV)
+
+This sample demonstrates sparse matrix-vector multiplication (SpMV)
+`y = A * x` using CUDA Tile C++.
diff --git a/cpp/9_CUDA_Tile/helloTile/CMakeLists.txt b/cpp/9_CUDA_Tile/helloTile/CMakeLists.txt
new file mode 100644
index 00000000..1406fea7
--- /dev/null
+++ b/cpp/9_CUDA_Tile/helloTile/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(helloTile LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+# Build for these GPU compute capabilities (SM 8.0 through 12.0).
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+# Pass --enable-tile to nvcc for every CUDA source (presumably required for __tile_global__ kernels).
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+# -G (device debug) and -lineinfo are mutually exclusive, so exactly one of them is added.
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Include directories (shared sample headers under Common/)
+include_directories(../../../Common)
+
+# Source file
+add_executable(helloTile helloTile.cu)
+# Require C++20 for both the host (CXX) and the CUDA compilation.
+target_compile_features(helloTile PRIVATE cxx_std_20 cuda_std_20)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/helloTile/README.md b/cpp/9_CUDA_Tile/helloTile/README.md
new file mode 100644
index 00000000..811f090b
--- /dev/null
+++ b/cpp/9_CUDA_Tile/helloTile/README.md
@@ -0,0 +1,31 @@
+# helloTile
+
+## Description
+
+This CUDA Tile C++ sample demonstrates basic usage of tile
+kernels. This code launches a tile kernel using the triple chevron
+syntax and passes data between SIMT and Tile code through global
+device memory.
+
+Error checks are performed using `cudaGetLastError` to catch kernel launch issues and `cudaDeviceSynchronize` to catch kernel execution issues.
+
+## Expected Output
+
+```
+Hello, SIMT!
+[SIMT] *x == 0
+[SIMT] *x = 100
+
+Hello, Tile!
+[Tile] *x == 100
+[Tile] *x = 200
+
+Hello, Host!
+[Host] *x == 200
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/helloTile/helloTile.cu b/cpp/9_CUDA_Tile/helloTile/helloTile.cu
new file mode 100644
index 00000000..8d1207be
--- /dev/null
+++ b/cpp/9_CUDA_Tile/helloTile/helloTile.cu
@@ -0,0 +1,76 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This CUDA Tile C++ sample demonstrates basic usage of tile
+ * kernels. This code launches a tile kernel using the triple chevron
+ * syntax and passes data between SIMT and Tile code through global
+ * device memory.  Error checks are performed using `cudaGetLastError`
+ * to catch kernel launch issues and `cudaDeviceSynchronize` to catch
+ * kernel execution issues.
+ */
+
+#include "helper_cuda.h"
+
+__global__ void simtKernel(int* x) {  // SIMT kernel: prints *x, stores 100 through x, prints it again
+  printf("Hello, SIMT!\n");
+  printf("[SIMT] *x == %i\n", *x);  // value found on entry
+
+  *x = 100;  // stored through the pointer so that code running later can read it
+  printf("[SIMT] *x = %i\n\n", *x);
+}
+
+__tile_global__ void tileKernel(int* x) {  // tile kernel: prints *x, stores 200 through x, prints it again
+  printf("Hello, Tile!\n");
+  printf("[Tile] *x == %i\n", *x);  // value found on entry
+
+  *x = 200;  // stored through the pointer so that code running later can read it
+  printf("[Tile] *x = %i\n\n", *x);
+}
+
+int main() {
+  int* d_x = nullptr;  // single int in device memory, shared by both kernels and the host
+
+  checkCudaErrors(cudaMalloc(&d_x, sizeof(int)));
+  checkCudaErrors(cudaMemset(d_x, 0, sizeof(int)));  // zero all bytes, so *d_x starts at 0
+
+  simtKernel<<<1, 1>>>(d_x);  // one block of one thread
+  checkCudaErrors(cudaGetLastError());       // catches kernel launch issues
+  checkCudaErrors(cudaDeviceSynchronize());  // waits for completion; catches execution issues
+
+  /* Launch the tile kernel. The threads-per-block parameter is omitted because it must always be 1. */
+  tileKernel<<<1>>>(d_x);
+  checkCudaErrors(cudaGetLastError());       // catches kernel launch issues
+  checkCudaErrors(cudaDeviceSynchronize());  // waits for completion; catches execution issues
+
+  int h_x = 0;  // host copy of the final device value
+  checkCudaErrors(cudaMemcpy(&h_x, d_x, sizeof(int), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaFree(d_x));
+
+  printf("Hello, Host!\n");
+  printf("[Host] *x == %i\n", h_x);
+}
diff --git a/cpp/9_CUDA_Tile/tileBmm/CMakeLists.txt b/cpp/9_CUDA_Tile/tileBmm/CMakeLists.txt
new file mode 100644
index 00000000..0bcbbfcd
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileBmm/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileBmm LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+# Build for these GPU compute capabilities (SM 8.0 through 12.0).
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+# Pass --enable-tile to nvcc for every CUDA source (presumably required for tile kernels).
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+# -G (device debug) and -lineinfo are mutually exclusive, so exactly one of them is added.
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Include directories (shared sample headers under Common/)
+include_directories(../../../Common)
+
+# Source file
+add_executable(tileBmm tileBmm.cu)
+# Require C++20 for both the host (CXX) and the CUDA compilation.
+target_compile_features(tileBmm PRIVATE cxx_std_20 cuda_std_20)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileBmm/README.md b/cpp/9_CUDA_Tile/tileBmm/README.md
new file mode 100644
index 00000000..09d30cde
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileBmm/README.md
@@ -0,0 +1,28 @@
+# tileBmm
+
+## Description
+
+This sample demonstrates a static-persistent batched matrix multiplication
+(BMM) using CUDA Tile C++. Given inputs A of shape (Q, M, K) and B of shape
+(Q, K, N), the kernel computes C = A x B of shape (Q, M, N). The grid
+launches a fixed number of persistent blocks sized from the device's SM
+count, and each block walks the (M, N, Q-chunk) tile space via a grid-stride
+loop. The batch dimension is tiled by BLOCK_SIZE_Q so every iteration issues
+a single rank-3 batched cuda::tiles::mma per K-step over tiles of shape
+(BLOCK_SIZE_Q, BLOCK_SIZE_M, BLOCK_SIZE_K) and
+(BLOCK_SIZE_Q, BLOCK_SIZE_K, BLOCK_SIZE_N). Grouped ordering on
+(pid_m, pid_n) gives L2 reuse. The accumulator is kept in float32 for
+precision, and masked loads/stores handle tiles that overhang the matrix
+or batch boundaries. Inputs and outputs use __half precision.
+
+## Expected Output
+
+```
+Success! BMM matches expected results.
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileBmm/tileBmm.cu b/cpp/9_CUDA_Tile/tileBmm/tileBmm.cu
new file mode 100644
index 00000000..46d1784c
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileBmm/tileBmm.cu
@@ -0,0 +1,268 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates a static-persistent batched matrix multiplication
+ * (BMM) using CUDA Tile C++. Given A of shape (Q, M, K) and B of shape
+ * (Q, K, N), the kernel computes C = A x B of shape (Q, M, N). The grid
+ * launches a fixed number of persistent blocks (sized from the device's SM
+ * count); each block walks the (M, N, Q-chunk) tile space via a grid-stride
+ * loop. The batch dimension is tiled by BLOCK_SIZE_Q so every iteration issues
+ * a single rank-3 batched cuda::tiles::mma per K-step over tiles of shape
+ * (BLOCK_SIZE_Q, BLOCK_SIZE_M, BLOCK_SIZE_K) x
+ * (BLOCK_SIZE_Q, BLOCK_SIZE_K, BLOCK_SIZE_N).  Grouped ordering on
+ * (pid_m, pid_n) gives L2 reuse. The accumulator is kept in float32 for
+ * precision, and masked loads/stores handle tiles that overhang the matrix
+ * or batch boundaries. Inputs and outputs use __half.
+ *
+ * A SIMT kernel is used to initialize the input matrices.
+ */
+
+#include "helper_cuda.h"
+
+#include "cuda_tile.h"
+#include "cuda_fp16.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+/* SIMT initializer for A (shape Q x M x K) and B (shape Q x K x N): each thread fills one flat
+ * index of each buffer. Values depend only on (m, k) / (k, n) and are bounded so the K-summed result fits in __half. */
+__global__ void initializeMatrices(__half* a, __half* b,
+                                   int Q, int M, int N, int K) {
+  /* widen before multiplying: a 32-bit blockIdx.x * blockDim.x would wrap past 2^32 elements */
+  std::size_t idx = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  std::size_t a_size = std::size_t(Q) * M * K;
+  std::size_t b_size = std::size_t(Q) * K * N;
+  if (idx < a_size) {
+    int k = int(idx % K);
+    int m = int((idx / K) % M);
+    a[idx] = __half{float((m + k + 1) % 8) / 32.0f};
+  }
+  if (idx < b_size) {
+    int n = int(idx % N);
+    int k = int((idx / N) % K);
+    b[idx] = __half{float((k + n + 1) % 8) / 32.0f};
+  }
+}
+
+/* Static-persistent tile kernel computing C = A @ B for batched 3D tensors.
+ * A: (Q, M, K), B: (Q, K, N), C: (Q, M, N), all non-transposed and described
+ * with layout_right mappings.  Problem shape, tile sizes, GROUP_SIZE_M and the
+ * NUM_CTAS / OCCUPANCY hints are template parameters.  Each block walks the
+ * linearised (Q-chunk, M, N) tile space from its own block id in steps of the
+ * block count, issuing one rank-3 batched mma per K-step for every tile. */
+template<typename T, int BLOCK_SIZE_Q, int BLOCK_SIZE_M, int BLOCK_SIZE_N,
+         int BLOCK_SIZE_K, int GROUP_SIZE_M, int Q, int M, int N, int K,
+         int NUM_CTAS, int OCCUPANCY>
+[[ using cutile :
+    hint(0, num_cta_in_cga=NUM_CTAS),
+    hint(0, occupancy=OCCUPANCY)
+]]
+__tile_global__ void persistent_bmm_kernel(const T* __restrict__ _a_ptr,
+                                           const T* __restrict__ _b_ptr,
+                                           T* __restrict__ _c_ptr) {
+  namespace ct = cuda::tiles;
+
+  /* tell the compiler the pointers are aligned (important for codegen) */
+  const T* a_ptr = ct::assume_aligned<16>(_a_ptr);
+  const T* b_ptr = ct::assume_aligned<16>(_b_ptr);
+  T* c_ptr = ct::assume_aligned<16>(_c_ptr);
+
+  /* accumulator tile kept in float32 for numerical precision; rank-3
+   * so the batched mma can fold the (q, m, k) x (q, k, n) -> (q, m, n)
+   * contraction in a single call. */
+  using AccTile = ct::tile<float,
+                           ct::shape<BLOCK_SIZE_Q, BLOCK_SIZE_M, BLOCK_SIZE_N>>;
+  /* this block's x index and the launched block count: start and step of the tile loop below */
+  int bid = ct::bid().x;
+  int num_programs = ct::num_blocks().x;
+
+  /* tile counts include a chunked batch axis */
+  constexpr int num_tiles_m = (M + BLOCK_SIZE_M - 1) / BLOCK_SIZE_M;
+  constexpr int num_tiles_n = (N + BLOCK_SIZE_N - 1) / BLOCK_SIZE_N;
+  constexpr int num_tiles_q = (Q + BLOCK_SIZE_Q - 1) / BLOCK_SIZE_Q;
+  constexpr int total_tiles = num_tiles_m * num_tiles_n * num_tiles_q;
+
+  /* loop-invariant partition views for A (Q, M, K), B (Q, K, N), C (Q, M, N) */
+  auto a_layout = ct::layout_right_mapping{ct::extents{Q, M, K}};
+  auto pA = ct::partition_view{
+      ct::tensor_span{a_ptr, a_layout},
+      ct::shape<BLOCK_SIZE_Q, BLOCK_SIZE_M, BLOCK_SIZE_K>{}};
+  auto b_layout = ct::layout_right_mapping{ct::extents{Q, K, N}};
+  auto pB = ct::partition_view{
+      ct::tensor_span{b_ptr, b_layout},
+      ct::shape<BLOCK_SIZE_Q, BLOCK_SIZE_K, BLOCK_SIZE_N>{}};
+  auto c_layout = ct::layout_right_mapping{ct::extents{Q, M, N}};
+  auto pC = ct::partition_view{
+      ct::tensor_span{c_ptr, c_layout},
+      ct::shape<BLOCK_SIZE_Q, BLOCK_SIZE_M, BLOCK_SIZE_N>{}};
+
+  /* grid-stride loop over (pid_q_chunk, pid_m, pid_n) tiles */
+  for (auto current_bid : ct::irange(bid, total_tiles, num_programs)) {
+    /* decode the linear tile id with grouped ordering on (m, n) for L2 reuse */
+    int pid_q = current_bid / (num_tiles_m * num_tiles_n);
+    int num_pid_in_group = GROUP_SIZE_M * num_tiles_n;
+    /* position inside this batch chunk's (m, n) plane; the last group may hold fewer than GROUP_SIZE_M rows */
+    int current_bid_2d = current_bid % (num_tiles_m * num_tiles_n);
+    int group_id = current_bid_2d / num_pid_in_group;
+    int first_pid_m = group_id * GROUP_SIZE_M;
+    int group_size_m_temp = num_tiles_m - first_pid_m;
+    int group_size_m = (group_size_m_temp < GROUP_SIZE_M)
+                       ? group_size_m_temp : GROUP_SIZE_M;
+    int pid_m = first_pid_m + (current_bid_2d % group_size_m);
+    int pid_n = (current_bid_2d % num_pid_in_group) / group_size_m;
+    /* fresh zero-filled float32 accumulator for this output tile */
+    auto accumulator = ct::zeros<AccTile>();
+
+    /* K-dimension accumulation loop; each iteration issues a single
+     * rank-3 mma across BLOCK_SIZE_Q batches. */
+    constexpr int num_k_tiles = (K + BLOCK_SIZE_K - 1) / BLOCK_SIZE_K;
+    for (auto k_tile : ct::irange(0, num_k_tiles)) {
+      auto a_tile = pA.load_masked(pid_q, pid_m, k_tile);
+      auto b_tile = pB.load_masked(pid_q, k_tile, pid_n);
+      accumulator = ct::mma(a_tile, b_tile, accumulator);
+    }
+    /* cast the accumulator to the output element type T and store the (q, m, n) tile with masking */
+    auto result = ct::element_cast<T>(accumulator);
+    pC.store_masked(result, pid_q, pid_m, pid_n);
+  }
+}
+
+int main() {
+  /* tile-shape template parameters: BLOCK_SIZE_M/N/K are multiples of 16
+   * (tensor-core friendly) that divide the test problem cleanly.  BLOCK_SIZE_Q
+   * controls how many batches each block fuses into a single rank-3 mma.
+   * NUM_CTAS and OCCUPANCY are launch hints for the cutile compiler.  These
+   * values mirror the production defaults used in the Ocean / TileGym BMM
+   * kernel for sm_100-class GPUs. */
+  constexpr int BLOCK_SIZE_Q = 1;
+  constexpr int BLOCK_SIZE_M = 256;
+  constexpr int BLOCK_SIZE_N = 256;
+  constexpr int BLOCK_SIZE_K = 64;
+  constexpr int GROUP_SIZE_M = 8;
+  constexpr int NUM_CTAS     = 2;
+  constexpr int OCCUPANCY    = 1;
+
+  /* problem dimensions are compile-time NTTPs so partition extents fold and
+   * total_tiles is constexpr inside the kernel.  Sizes are kept small so the
+   * CPU reference comparison stays fast; the launch config above is still
+   * the production sm_100 set (which is tuned for much larger shapes). */
+  constexpr int Q = 4;
+  constexpr int M = 256;
+  constexpr int N = 256;
+  constexpr int K = 128;
+
+  std::size_t a_size = std::size_t(Q) * M * K;
+  std::size_t b_size = std::size_t(Q) * K * N;
+  std::size_t c_size = std::size_t(Q) * M * N;
+  /* device buffers for the inputs A, B and the output C */
+  __half* d_A = nullptr;
+  __half* d_B = nullptr;
+  __half* d_C = nullptr;
+  checkCudaErrors(cudaMalloc(&d_A, a_size * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_B, b_size * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_C, c_size * sizeof(__half)));
+
+  /* populate A and B with deterministic test data on the device */
+  int init_threads = 256;
+  std::size_t init_elems = (a_size > b_size) ? a_size : b_size;
+  int init_blocks = int((init_elems + init_threads - 1) / init_threads);
+  initializeMatrices<<<init_blocks, init_threads>>>(d_A, d_B, Q, M, N, K);
+  checkCudaErrors(cudaGetLastError());
+
+  /* compute a CPU reference using double accumulation, then cast to __half */
+  __half* h_A = new __half[a_size];
+  __half* h_B = new __half[b_size];
+  __half* h_C_ref = new __half[c_size];
+  checkCudaErrors(cudaMemcpy(h_A, d_A, a_size * sizeof(__half), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_B, d_B, b_size * sizeof(__half), cudaMemcpyDeviceToHost));
+
+  for (int q = 0; q < Q; ++q) {
+    for (int m = 0; m < M; ++m) {
+      for (int n = 0; n < N; ++n) {
+        double acc = 0.0;
+        for (int k = 0; k < K; ++k) {
+          double av = double(float(h_A[(std::size_t(q) * M + m) * K + k]));
+          double bv = double(float(h_B[(std::size_t(q) * K + k) * N + n]));
+          acc += av * bv;
+        }
+        h_C_ref[(std::size_t(q) * M + m) * N + n] = __half{float(acc)};
+      }
+    }
+  }
+
+  /* launch the persistent BMM kernel: grid size mirrors the static-persistent
+   * formula min(NUM_SMS / NUM_CTAS, total_tiles) * OCCUPANCY -- enough
+   * blocks to either saturate the device or cover all tiles, whichever is
+   * smaller. */
+  cudaDeviceProp prop;
+  checkCudaErrors(cudaGetDeviceProperties(&prop, 0));
+  int num_sms = prop.multiProcessorCount;
+
+  constexpr int num_tiles_m = (M + BLOCK_SIZE_M - 1) / BLOCK_SIZE_M;
+  constexpr int num_tiles_n = (N + BLOCK_SIZE_N - 1) / BLOCK_SIZE_N;
+  constexpr int num_tiles_q = (Q + BLOCK_SIZE_Q - 1) / BLOCK_SIZE_Q;
+  constexpr int total_tiles = num_tiles_m * num_tiles_n * num_tiles_q;
+
+  int base_programs = num_sms / NUM_CTAS;
+  int grid_size = (base_programs < total_tiles ? base_programs : total_tiles)
+                  * OCCUPANCY;
+
+  persistent_bmm_kernel<__half, BLOCK_SIZE_Q, BLOCK_SIZE_M, BLOCK_SIZE_N,
+                        BLOCK_SIZE_K, GROUP_SIZE_M, Q, M, N, K,
+                        NUM_CTAS, OCCUPANCY>
+      <<<dim3(grid_size, 1, 1)>>>(d_A, d_B, d_C);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  __half* h_C = new __half[c_size];
+  checkCudaErrors(cudaMemcpy(h_C, d_C, c_size * sizeof(__half), cudaMemcpyDeviceToHost));
+
+  bool ok = true;  /* cleared on the first mismatch; the cleanup below runs either way */
+  for (std::size_t idx = 0; ok && idx < c_size; ++idx) {
+    float got = float(h_C[idx]);
+    float ref = float(h_C_ref[idx]);
+    float diff = got > ref ? got - ref : ref - got;
+    if (!(diff <= 1e-1f)) {  /* negated form so a NaN difference is reported as a mismatch too */
+      printf("Expected: h_C[%zu] == %f\n", idx, ref);
+      printf("Actual:   h_C[%zu] == %f\n", idx, got);
+      ok = false;
+    }
+  }
+  if (ok) printf("Success! BMM matches expected results.\n");
+
+  checkCudaErrors(cudaFree(d_A));
+  checkCudaErrors(cudaFree(d_B));
+  checkCudaErrors(cudaFree(d_C));
+
+  delete[] h_A;
+  delete[] h_B;
+  delete[] h_C;
+  delete[] h_C_ref;
+  return ok ? 0 : 1;  /* exit status 1 on mismatch, 0 on success */
+}
diff --git a/cpp/9_CUDA_Tile/tileLayerNorm/CMakeLists.txt b/cpp/9_CUDA_Tile/tileLayerNorm/CMakeLists.txt
new file mode 100644
index 00000000..7010fe7a
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileLayerNorm/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+# Add the repository-level cmake/Modules directory to the module search path.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileLayerNorm LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+# GPU architectures to generate device code for.
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+# Pass --enable-tile to the CUDA compiler (presumably what enables CUDA Tile C++ kernels -- confirm in the toolkit docs).
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+# ENABLE_CUDA_DEBUG adds -G; every other build gets -lineinfo instead.
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Header search path: the repository's shared Common/ directory
+include_directories(../../../Common)
+
+# Build the sample executable from its single source file
+add_executable(tileLayerNorm tileLayerNorm.cu)
+# Require C++20 for both the host (CXX) and the CUDA compilation of this target.
+target_compile_features(tileLayerNorm PRIVATE cxx_std_20 cuda_std_20)
+
+# Include the shared installation configuration and invoke its setup_samples_install()
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileLayerNorm/README.md b/cpp/9_CUDA_Tile/tileLayerNorm/README.md
new file mode 100644
index 00000000..4ca7d5c2
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileLayerNorm/README.md
@@ -0,0 +1,26 @@
+# tileLayerNorm
+
+## Description
+
+This sample demonstrates a persistent layer-norm forward pass using
+CUDA Tile C++:
+`y = (x - mean) * rsqrt(var + eps) * weight + bias`. The grid launches `NUM_SMS`
+persistent blocks; each block walks the row dimension with a grid-stride loop,
+processing `BLOCK_N` rows by `BLOCK_D` cols per iteration and
+striding by `NUM_SMS * BLOCK_N` rows between iterations. Per-row
+mean and inverse standard deviation are reduced across the column
+dimension with `cuda::tiles` row reductions and saved to float32
+side buffers, while the weight and bias tiles are loaded once and
+broadcast across rows.
+
+## Expected Output
+
+```
+Success! Persistent LayerNorm matches expected results.
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileLayerNorm/tileLayerNorm.cu b/cpp/9_CUDA_Tile/tileLayerNorm/tileLayerNorm.cu
new file mode 100644
index 00000000..4ede3081
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileLayerNorm/tileLayerNorm.cu
@@ -0,0 +1,270 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates a persistent LayerNorm forward pass using
+ * CUDA Tile C++:  y = (x - mean) * rsqrt(var + eps) * weight + bias.
+ * The grid launches NUM_SMS persistent blocks; each block walks the
+ * row dimension with a grid-stride loop, processing BLOCK_N rows x
+ * BLOCK_D cols per iteration and striding by NUM_SMS * BLOCK_N rows.
+ * Per-row mean and rstd are reduced over the column dimension and
+ * (when COMPUTE_MEAN_AND_RSTD and TRAINING are enabled) saved to
+ * float32 side buffers. N, D, NUM_SMS, and EPS are template NTTPs so
+ * the tile compiler can fold the loop step, the (1/D) reciprocal,
+ * partition_view extents, and the (var + eps) broadcast.  A SIMT
+ * kernel is used to initialize X, W, and B on device.
+ */
+
+#include "helper_cuda.h"
+#include "cuda_tile.h"
+#include "cuda_fp16.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+/* SIMT initializer: one thread per flat index fills X (N x D), W (D,), B (D,) deterministically. */
+__global__ void initializeInputs(__half* X, __half* W, __half* B,
+                                 int N, int D) {
+  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  /* the first D threads also write one weight and one bias element each */
+  if (tid < D) {
+    W[tid] = __half{1.0f + 0.1f * float(tid % 5)};
+    B[tid] = __half{0.1f * float(tid % 3)};
+  }
+  /* threads past the end of X have nothing left to write */
+  if (tid >= N * D) return;
+  int row = tid / D;
+  int col = tid % D;
+  X[tid] = __half{float((row + col) % 7) - 3.5f};
+}
+
+template<typename T,
+         int BLOCK_N,
+         int BLOCK_D,
+         bool TRAINING,
+         bool COMPUTE_MEAN_AND_RSTD,
+         int N,
+         int D,        // compile-time so `(... / D)` and partition_view extents fold.
+         int NUM_SMS,  // compile-time so the persistent for-loop step is constant.
+         float EPS>    // compile-time so the (var + eps) reshape/broadcast hoists out of the for-loop
+[[ using cutile : hint(0, num_cta_in_cga=1) ]]
+__tile_global__ void persistent_layer_norm_fwd_kernel(
+    const T* __restrict__ X,       // (N, D)
+    T* __restrict__ Y,             // (N, D)
+    const T* __restrict__ W,       // (D,)
+    const T* __restrict__ B,       // (D,)
+    float* __restrict__ Mean,      // (N,)
+    float* __restrict__ Rstd       // (N,)
+) {
+    namespace ct = cuda::tiles;
+    // LayerNorm forward: Y = (X - mean) * rstd * W + B per row, with the arithmetic done in float32.
+    using f32_N     = ct::tile<float, ct::shape<BLOCK_N>>;
+
+    X    = ct::assume_aligned<16>(X);
+    Y    = ct::assume_aligned<16>(Y);
+    W    = ct::assume_aligned<16>(W);
+    B    = ct::assume_aligned<16>(B);
+    // Mean and Rstd are used as passed; no alignment assumption is attached to them.
+    int pid = ct::bid().x;
+    constexpr int upper_bound = (N + BLOCK_N - 1) / BLOCK_N;
+
+    // Partitioned views with compile-time extents (N, D are template NTTPs).
+    using ExtND = ct::extents<uint32_t, static_cast<uint32_t>(N), static_cast<uint32_t>(D)>;
+    using ExtD  = ct::extents<uint32_t, static_cast<uint32_t>(D)>;
+    using ExtN  = ct::extents<uint32_t, static_cast<uint32_t>(N)>;
+    auto pX = ct::partition_view(
+        ct::tensor_span{X, ExtND{}},
+        ct::shape<BLOCK_N, BLOCK_D>{});
+    auto pY = ct::partition_view(
+        ct::tensor_span{Y, ExtND{}},
+        ct::shape<BLOCK_N, BLOCK_D>{});
+    auto pW = ct::partition_view(
+        ct::tensor_span{W, ExtD{}},
+        ct::shape<BLOCK_D>{});
+    auto pB = ct::partition_view(
+        ct::tensor_span{B, ExtD{}},
+        ct::shape<BLOCK_D>{});
+    auto pMean = ct::partition_view(
+        ct::tensor_span{Mean, ExtN{}},
+        ct::shape<BLOCK_N>{});
+    auto pRstd = ct::partition_view(
+        ct::tensor_span{Rstd, ExtN{}},
+        ct::shape<BLOCK_N>{});
+
+    // Load weights once (hoisted out of the grid-stride loop).
+    auto w = ct::element_cast<float>(pW.load(0));  // (BLOCK_D,)
+    auto b = ct::element_cast<float>(pB.load(0));  // (BLOCK_D,)
+
+    // Broadcast weights into (BLOCK_N, BLOCK_D) by reshape to (1, BLOCK_D).
+    auto w_bcast = ct::reshape<ct::shape<1, BLOCK_D>>(w);
+    auto b_bcast = ct::reshape<ct::shape<1, BLOCK_D>>(b);
+
+    constexpr float inv_D_scalar = 1.0f / static_cast<float>(D);
+    auto inv_D_tile = ct::full<f32_N>(inv_D_scalar);
+    auto eps_tile   = ct::full<f32_N>(EPS);
+
+    using TileXNxD = ct::tile<T, ct::shape<BLOCK_N, BLOCK_D>>;
+    // One BLOCK_N-row tile per iteration; only column tile 0 is loaded/stored, so a row is expected to fit in BLOCK_D (NOTE(review): confirm).
+    for (auto current_pid : ct::irange(pid, upper_bound, NUM_SMS)) {
+        TileXNxD x_tile;
+        [[ using cutile : hint(0, latency=4) ]]
+        x_tile = pX.load_masked(current_pid, 0);
+        auto x = ct::element_cast<float>(x_tile);
+
+        f32_N mean;
+        f32_N rstd;
+
+        if constexpr (COMPUTE_MEAN_AND_RSTD) {
+            // Step 1: Compute x^2 then sum/mean.  Use the loop-invariant
+            // `inv_D_tile` and `eps_tile` (built outside the loop) so the
+            // reshape + broadcast of those scalars stays hoisted.
+            auto x_squared = x * x;
+            auto avg_square_2d = ct::sum<1>(x_squared);
+            auto avg_square = ct::reshape<ct::shape<BLOCK_N>>(avg_square_2d) * inv_D_tile;
+            auto mean_2d = ct::sum<1>(x);
+            mean = ct::reshape<ct::shape<BLOCK_N>>(mean_2d) * inv_D_tile;
+            auto var = avg_square - mean * mean;  // var = E[x^2] - (E[x])^2
+
+            rstd = ct::rsqrt(var + eps_tile);
+
+            if constexpr (TRAINING) {
+                [[ using cutile : hint(0, allow_tma=false) ]]
+                pMean.store_masked(mean, current_pid);
+                [[ using cutile : hint(0, allow_tma=false) ]]
+                pRstd.store_masked(rstd, current_pid);
+            }
+        } else {
+            mean = pMean.load_masked(current_pid);
+            rstd = pRstd.load_masked(current_pid);
+        }
+
+        // Broadcast mean/rstd to (BLOCK_N, 1) then rely on implicit broadcast
+        // against (BLOCK_N, BLOCK_D).
+        auto mean_col = ct::reshape<ct::shape<BLOCK_N, 1>>(mean);
+        auto rstd_col = ct::reshape<ct::shape<BLOCK_N, 1>>(rstd);
+
+        auto x_hat = (x - mean_col) * rstd_col;
+        auto y_f32 = x_hat * w_bcast + b_bcast;
+
+        auto y_T = ct::element_cast<T>(y_f32);
+        [[ using cutile : hint(0, allow_tma=false) ]]
+        pY.store_masked(y_T, current_pid, 0);
+    }
+}
+
+int main() {
+  /* BLOCK_D == D so each persistent-loop iteration covers a full row's
+   * columns in one tile per row. NUM_SMS is a template NTTP, hence
+   * compile-time; 132 matches B200 / Hopper-class GPUs - adjust to
+   * match the target device's `multiProcessorCount` for best perf. */
+  constexpr int   N = 1024, D = 256;
+  constexpr int   BLOCK_N = 4, BLOCK_D = 256;
+  constexpr int   NUM_SMS = 132;
+  constexpr float EPS     = 1e-5f;
+
+  __half *d_X = nullptr, *d_Y = nullptr, *d_W = nullptr, *d_B = nullptr;
+  float  *d_Mean = nullptr, *d_Rstd = nullptr;
+  checkCudaErrors(cudaMalloc(&d_X,    N * D * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_Y,    N * D * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_W,    D     * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_B,    D     * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_Mean, N     * sizeof(float)));
+  checkCudaErrors(cudaMalloc(&d_Rstd, N     * sizeof(float)));
+
+  int init_threads = 256, init_blocks = 1 + ((N * D - 1) / init_threads);
+  initializeInputs<<<init_blocks, init_threads>>>(d_X, d_W, d_B, N, D);
+  checkCudaErrors(cudaGetLastError());
+
+  /* NUM_SMS is a compile-time NTTP that doubles as the persistent-loop
+   * stride; the launch grid x must equal NUM_SMS for correctness.
+   * Adjust the constant (and recompile) for one block per SM on a
+   * device with a different SM count. */
+  persistent_layer_norm_fwd_kernel<__half, BLOCK_N, BLOCK_D,
+                                   /*TRAINING=*/true, /*COMPUTE_MEAN_AND_RSTD=*/true,
+                                   N, D, NUM_SMS, EPS>
+      <<<dim3(NUM_SMS, 1, 1)>>>(d_X, d_Y, d_W, d_B, d_Mean, d_Rstd);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  __half* h_Y        = new __half[N * D];
+  __half* h_Y_ref    = new __half[N * D];
+  float*  h_Mean     = new float[N];
+  float*  h_Rstd     = new float[N];
+  float*  h_Mean_ref = new float[N];
+  float*  h_Rstd_ref = new float[N];
+  checkCudaErrors(cudaMemcpy(h_Y,    d_Y,    N * D * sizeof(__half), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_Mean, d_Mean, N     * sizeof(float),  cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_Rstd, d_Rstd, N     * sizeof(float),  cudaMemcpyDeviceToHost));
+
+  /* CPU reference in double precision; compare with 1e-1 fp16 tolerance
+   * for Y and 1e-3 for the float32 Mean/Rstd outputs (NaN counts as a mismatch). */
+  for (int m = 0; m < N; ++m) {
+    double sum = 0.0, sumsq = 0.0;
+    for (int n = 0; n < D; ++n) {
+      double x = double(float((m + n) % 7) - 3.5f);
+      sum += x; sumsq += x * x;
+    }
+    double mu      = sum / double(D);
+    double var     = sumsq / double(D) - mu * mu;
+    double inv_std = 1.0 / std::sqrt(var + double(EPS));
+    h_Mean_ref[m]  = float(mu);
+    h_Rstd_ref[m]  = float(inv_std);
+    for (int n = 0; n < D; ++n) {
+      double x = double(float((m + n) % 7) - 3.5f);
+      double w = double(1.0f + 0.1f * float(n % 5));
+      double b = double(0.1f * float(n % 3));
+      h_Y_ref[m * D + n] = __half(float((x - mu) * inv_std * w + b));
+    }
+  }
+
+  bool ok = true;  /* cleared on the first mismatch; the cleanup below runs either way */
+  for (int idx = 0; ok && idx < N * D; ++idx) {
+    float expected = float(h_Y_ref[idx]), actual = float(h_Y[idx]);
+    if (!(std::fabs(expected - actual) <= 1e-1f)) {  /* negated so NaN is caught */
+      printf("Mismatch h_Y[%d]: expected %f, actual %f\n", idx, expected, actual);
+      ok = false;
+    }
+  }
+  for (int m = 0; ok && m < N; ++m) {
+    if (!(std::fabs(h_Mean_ref[m] - h_Mean[m]) <= 1e-3f)) {
+      printf("Mismatch h_Mean[%d]: expected %f, actual %f\n", m, h_Mean_ref[m], h_Mean[m]);
+      ok = false;
+    } else if (!(std::fabs(h_Rstd_ref[m] - h_Rstd[m]) <= 1e-3f)) {
+      printf("Mismatch h_Rstd[%d]: expected %f, actual %f\n", m, h_Rstd_ref[m], h_Rstd[m]);
+      ok = false;
+    }
+  }
+  if (ok) printf("Success! Persistent LayerNorm matches expected results.\n");
+
+  checkCudaErrors(cudaFree(d_X));   checkCudaErrors(cudaFree(d_Y));
+  checkCudaErrors(cudaFree(d_W));   checkCudaErrors(cudaFree(d_B));
+  checkCudaErrors(cudaFree(d_Mean)); checkCudaErrors(cudaFree(d_Rstd));
+  delete[] h_Y;        delete[] h_Y_ref;
+  delete[] h_Mean;     delete[] h_Rstd;
+  delete[] h_Mean_ref; delete[] h_Rstd_ref;
+  return ok ? 0 : 1;  /* exit status 1 on mismatch, 0 on success */
+}
diff --git a/cpp/9_CUDA_Tile/tileMatmul/CMakeLists.txt b/cpp/9_CUDA_Tile/tileMatmul/CMakeLists.txt
new file mode 100644
index 00000000..bb496cf8
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmul/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+# Add the repository-level cmake/Modules directory to the module search path.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileMatmul LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+# GPU architectures to generate device code for.
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+# Pass --enable-tile to the CUDA compiler (presumably what enables CUDA Tile C++ kernels -- confirm in the toolkit docs).
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+# ENABLE_CUDA_DEBUG adds -G; every other build gets -lineinfo instead.
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Header search paths: the repository's shared Common/ directory and the tile samples' Benchmark_Common/
+include_directories(../../../Common ../Benchmark_Common)
+
+# Build the sample executable from its single source file
+add_executable(tileMatmul tileMatmul.cu)
+# Require C++20 for both the host (CXX) and the CUDA compilation of this target.
+target_compile_features(tileMatmul PRIVATE cxx_std_20 cuda_std_20)
+
+# Include the shared installation configuration and invoke its setup_samples_install()
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileMatmul/README.md b/cpp/9_CUDA_Tile/tileMatmul/README.md
new file mode 100644
index 00000000..48cd3f20
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmul/README.md
@@ -0,0 +1,55 @@
+# tileMatmul
+
+## Description
+
+This sample demonstrates how to write a matrix multiplication kernel with good
+performance in CUDA Tile C++. The kernel multiplies FP16 input tiles with FP32
+accumulation using `cuda::tiles::mma`.
+
+The sample compares a naive implementation with an optimized implementation
+that applies good practices and provides the compiler with additional guidance
+for better code generation. The host code uses CUDA events to capture execution time.
+
+The optimized kernel uses neutral placeholder values for `LOAD_LATENCY` and
+`STORE_LATENCY`. See [tileMatmulAutotuner](../tileMatmulAutotuner) for an
+example of autotuning these values to find a suitable
+configuration for your hardware.
+
+## Running
+
+```
+# Run with default warmup and benchmark iterations. Validation is disabled by default.
+./tileMatmul
+
+# Enable CPU validation
+./tileMatmul --validate
+
+# To run faster, skip the warmup (not recommended) and run a single benchmark iteration
+./tileMatmul --skip-warmup -i 1
+
+# Show all options
+./tileMatmul --help
+```
+
+## Command-Line Options
+
+| Option | Description |
+|--------|-------------|
+| `--validate` | Enable CPU cross-validation. Validation is disabled by default. |
+| `--skip-warmup` | Disable warmup iterations. |
+| `--warmup=N` | Set warmup iterations. The default is 5. |
+| `-i N`, `--iters=N` | Set benchmark iterations. The default is 20. |
+
+## Example Output
+
+```
+Note: CPU cross-validation disabled
+  matmul                                    :   0.073 ms,   115.2 GB/s, 29495.8 GFLOPS
+  matmul_naive                              :   0.497 ms,    16.9 GB/s, 4322.8 GFLOPS
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileMatmul/tileMatmul.cu b/cpp/9_CUDA_Tile/tileMatmul/tileMatmul.cu
new file mode 100644
index 00000000..b025a3cb
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmul/tileMatmul.cu
@@ -0,0 +1,282 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates matrix multiplication using CUDA Tile C++.
+ * The input matrices are split into tiles. Each Tile block computes one
+ * output tile by loading FP16 tiles from A and B, accumulating with
+ * cuda::tiles::mma in FP32, and storing an FP32 result tile.
+ *
+ * The sample compares a naive implementation with an optimized implementation.
+ * The optimized kernel adds compiler guidance such as pointer alignment and
+ * divisibility assumptions, cuda::tiles::irange, and latency hints. The host
+ * code times both kernels with CUDA events and, when --validate is passed,
+ * checks both results against a CPU reference.
+ */
+
+#include "helper_cuda.h"
+#include "matmul_benchmark.h"
+
+#include "cuda_tile.h"
+#include "cuda_fp16.h"
+
+#include <cstdlib>
+#include <cstdio>
+#include <vector>
+
+constexpr int TILE_BLOCK_M = 32;   // tile rows of A and C handled by one Tile block
+constexpr int TILE_BLOCK_N = 64;   // tile columns of B and C handled by one Tile block
+constexpr int TILE_BLOCK_K = 64;   // reduction-dimension extent of each A/B tile
+constexpr int LOAD_LATENCY = 5;    // latency hint attached to the A/B tile loads in matmul()
+constexpr int STORE_LATENCY = 5;   // latency hint attached to the C tile store in matmul()
+
+/*
+ * Baseline Tile C++ matmul. This keeps the same tensor_span, partition_view,
+ * and ct::mma structure as the optimized kernel, but avoids optimization
+ * hints, assumptions, Tile-specific loop guidance, and the divisibility
+ * precondition used by non-masked view loads and stores.
+ */
+__tile_global__ void matmul_naive(float* C,
+                                  const __half* A,
+                                  const __half* B,
+                                  int M, int N, int K) {
+
+    namespace ct = cuda::tiles;
+
+    // create tensor spans with runtime shapes (FP16 for A and B, FP32 for C)
+    auto a_span = ct::tensor_span{A, ct::extents{M, K}};
+    auto b_span = ct::tensor_span{B, ct::extents{K, N}};
+    auto c_span = ct::tensor_span{C, ct::extents{M, N}};
+    
+    // create partition views with compile-time tile sizes
+    auto a_view = ct::partition_view{a_span, ct::shape<TILE_BLOCK_M, TILE_BLOCK_K>{}};
+    auto b_view = ct::partition_view{b_span, ct::shape<TILE_BLOCK_K, TILE_BLOCK_N>{}};
+    auto c_view = ct::partition_view{c_span, ct::shape<TILE_BLOCK_M, TILE_BLOCK_N>{}};
+    
+    // get block indices from the 2D grid; the third index is not used
+    auto [pid_m, pid_n, dummy] = ct::bid();
+    
+    // initialize FP32 accumulator
+    auto acc = ct::zeros<ct::tile<float, ct::shape<TILE_BLOCK_M, TILE_BLOCK_N>>>();
+    
+    // loop over the K dimension in blocks, rounding up to cover a partial last block
+    int num_k_blocks = (K + TILE_BLOCK_K - 1) / TILE_BLOCK_K;
+    for (int k_block = 0; k_block < num_k_blocks; ++k_block) {
+        ct::tile<__half, ct::shape<TILE_BLOCK_M, TILE_BLOCK_K>> a_tile;
+        ct::tile<__half, ct::shape<TILE_BLOCK_K, TILE_BLOCK_N>> b_tile;
+        
+        // load blocks of A and B (FP16), using the default zero padding for boundary tiles
+        a_tile = a_view.load_masked(pid_m, k_block);
+        b_tile = b_view.load_masked(k_block, pid_n);
+        
+        // accumulate: acc += A_block @ B_block (FP16 inputs, FP32 accumulator)
+        // ct::mma handles mixed precision: FP16 operands with FP32 accumulator.
+        acc = ct::mma(a_tile, b_tile, acc);
+    }
+    
+    // store result (FP32), suppressing stores outside the output matrix
+    c_view.store_masked(acc, pid_m, pid_n);
+}
+
+/*
+ * optimization strategy: declare non-aliasing pointers with __restrict__.
+ * This gives the compiler more freedom to schedule loads and stores for A,
+ * B, and C because the output tile cannot overlap the input tiles.
+ */
+__tile_global__ void matmul(float* __restrict__ _C,
+                            const __half* __restrict__ _A,
+                            const __half* __restrict__ _B,
+                            int _M, int _N, int _K) {
+    namespace ct = cuda::tiles;
+    using namespace ct::literals;
+
+    /*
+     * optimization strategy: communicate pointer alignment explicitly.
+     * Assumptions are optimization facts, not runtime checks; the returned
+     * value must be used, and violating the assumption is undefined behavior.
+     */
+    float* C = ct::assume_aligned(_C, 16_ic);
+    const __half* A = ct::assume_aligned(_A, 16_ic);
+    const __half* B = ct::assume_aligned(_B, 16_ic);
+
+    /*
+     * optimization strategy: communicate divisible problem dimensions. This
+     * gives the compiler static facts about index arithmetic and avoids extra
+     * boundary handling in this optimized divisible-size path.
+     */
+    auto M = ct::assume_divisible(_M, 16_ic);
+    auto N = ct::assume_divisible(_N, 16_ic);
+    auto K = ct::assume_divisible(_K, 16_ic);
+
+    auto a_span = ct::tensor_span{A, ct::extents{M, K}};  // A viewed with extents M x K
+    auto b_span = ct::tensor_span{B, ct::extents{K, N}};  // B viewed with extents K x N
+    auto c_span = ct::tensor_span{C, ct::extents{M, N}};  // C viewed with extents M x N
+
+    auto a_view = ct::partition_view{a_span, ct::shape<TILE_BLOCK_M, TILE_BLOCK_K>{}};
+    auto b_view = ct::partition_view{b_span, ct::shape<TILE_BLOCK_K, TILE_BLOCK_N>{}};
+    auto c_view = ct::partition_view{c_span, ct::shape<TILE_BLOCK_M, TILE_BLOCK_N>{}};
+
+    auto [pid_m, pid_n, dummy] = ct::bid();  // the third block index is not used
+
+    auto acc = ct::zeros<ct::tile<float, ct::shape<TILE_BLOCK_M, TILE_BLOCK_N>>>();  // FP32 accumulator tile
+    
+    /*
+     * optimization strategy: use ct::irange for the K traversal. This gives the
+     * compiler a Tile-aware structured loop form for iterating through the
+     * reduction dimension, unlike the plain C++ loop used in matmul_naive.
+     */
+    int num_k_blocks = (K + TILE_BLOCK_K - 1) / TILE_BLOCK_K;  // exact: the host only launches with K % TILE_BLOCK_K == 0
+    for (auto k_block : ct::irange(0, num_k_blocks)) {
+        ct::tile<__half, ct::shape<TILE_BLOCK_M, TILE_BLOCK_K>> a_tile;
+        ct::tile<__half, ct::shape<TILE_BLOCK_K, TILE_BLOCK_N>> b_tile;
+        
+        /*
+         * optimization strategy: attach a latency hint to the A tile load.
+         * Tile optimization hints can appertain to expression statements and
+         * influence scheduling decisions for that construct. The non-masked
+         * load also lets the compiler assume every element in this tile is
+         * in bounds; the host only launches this kernel for divisible sizes.
+         */
+        [[
+            cutile::hint(0, latency=LOAD_LATENCY),
+        ]]
+        a_tile = a_view.load(pid_m, k_block);
+
+        /*
+         * optimization strategy: attach the same latency hint to the B tile
+         * load so the compiler can schedule both input streams with the
+         * expected memory latency in mind. As above, this relies on the
+         * optimized kernel's divisibility precondition.
+         */
+        [[
+            cutile::hint(0, latency=LOAD_LATENCY),
+        ]] 
+        b_tile = b_view.load(k_block, pid_n);
+
+        acc = ct::mma(a_tile, b_tile, acc);  // acc += a_tile @ b_tile (FP16 operands, FP32 accumulator)
+    }
+    
+    /*
+     * optimization strategy: attach a store latency hint to the final C tile
+     * store. This mirrors the load hints and gives the compiler information
+     * about the expected memory latency of the output path. The non-masked
+     * store also avoids boundary masking under the same divisibility
+     * precondition.
+     */
+    [[
+        cutile::hint(0, latency=STORE_LATENCY),
+    ]]
+    c_view.store(acc, pid_m, pid_n);
+}
+
+void run_with_size(int M, int N, int K) {
+  // Benchmarks matmul() and matmul_naive() on an (M x K) * (K x N) problem; with
+  // validation enabled both are checked against a CPU reference. Exits on failure.
+  if (M <= 0 || N <= 0 || K <= 0) {
+    std::fprintf(stderr,
+                 "M, N, and K must be positive (got M=%d, N=%d, K=%d).\n", M, N, K);
+    std::exit(EXIT_FAILURE);
+  }
+
+  // matmul() uses non-masked loads and stores, so reject non-divisible sizes
+  // up front, before any host or device buffers are set up
+  if (M % TILE_BLOCK_M != 0 || N % TILE_BLOCK_N != 0 || K % TILE_BLOCK_K != 0) {
+    std::fprintf(stderr,
+                 "Skipping M=%d, N=%d, K=%d as the optimized kernel assumes "
+                 "dimensions divisible by TILE_BLOCK_M=%d, TILE_BLOCK_N=%d, "
+                 "TILE_BLOCK_K=%d.\n",
+                 M, N, K, TILE_BLOCK_M, TILE_BLOCK_N, TILE_BLOCK_K);
+    std::exit(EXIT_FAILURE);
+  }
+
+  // element counts as size_t so large dimensions cannot overflow int arithmetic
+  const size_t a_elems = size_t(M) * K, b_elems = size_t(K) * N, c_elems = size_t(M) * N;
+
+  // allocate and initialize FP16 inputs on the host
+  std::vector<__half> h_A(a_elems), h_B(b_elems);
+  srand(42);
+  for (size_t i = 0; i < a_elems; i++) h_A[i] = __float2half((float)rand() / RAND_MAX - 0.5f);
+  for (size_t i = 0; i < b_elems; i++) h_B[i] = __float2half((float)rand() / RAND_MAX - 0.5f);
+  std::vector<float> h_expected;
+  if (use_validation()) {
+    // build the expected result once, then reuse it for both device kernels
+    h_expected.resize(c_elems);
+    matmul_cpu(h_expected.data(), h_A.data(), h_B.data(), M, N, K);
+  }
+
+  __half *d_A, *d_B;
+  float *d_C;
+  checkCudaErrors(cudaMalloc(&d_A, a_elems * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_B, b_elems * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_C, c_elems * sizeof(float)));
+  checkCudaErrors(cudaMemcpy(d_A, h_A.data(), a_elems * sizeof(__half), cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(d_B, h_B.data(), b_elems * sizeof(__half), cudaMemcpyHostToDevice));
+
+  dim3 grid(M / TILE_BLOCK_M, N / TILE_BLOCK_N);
+  auto run_kernel = [&](const char* name, auto kernel_launch) {
+    // clear C before each benchmarked kernel for independent validation
+    checkCudaErrors(cudaMemset(d_C, 0, c_elems * sizeof(float)));
+    BenchmarkResult result = run_benchmark(name,
+        [&]() {
+          kernel_launch();
+          checkCudaErrors(cudaGetLastError());
+        },
+        [&]() {
+          std::vector<float> h_C(c_elems);
+          checkCudaErrors(cudaMemcpy(h_C.data(), d_C, c_elems * sizeof(float),
+                                     cudaMemcpyDeviceToHost));
+          return verify_matmul_result(name, h_C.data(), h_expected.data(), M, N);
+        },
+        M, N, K);
+    print_result(result);
+    return result.correct;
+  };
+
+  // run and validate the optimized kernel
+  bool passed = true;
+  passed &= run_kernel("matmul", [&]() {
+    matmul<<<grid, 1>>>(d_C, d_A, d_B, M, N, K);
+  });
+
+  // run and validate the baseline kernel
+  passed &= run_kernel("matmul_naive", [&]() {
+    matmul_naive<<<grid, 1>>>(d_C, d_A, d_B, M, N, K);
+  });
+
+  checkCudaErrors(cudaFree(d_A));
+  checkCudaErrors(cudaFree(d_B));
+  checkCudaErrors(cudaFree(d_C));
+
+  if (!passed) {
+    std::exit(EXIT_FAILURE);
+  }
+}
+
+int main(int argc, char** argv) {
+  parse_benchmark_args(argc, argv);  // presumably handles the options listed in the README (defined outside this file)
+  run_with_size(1024, 1024, 1024);   // exits with EXIT_FAILURE itself when the run does not pass
+}
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/CMakeLists.txt b/cpp/9_CUDA_Tile/tileMatmulAutotuner/CMakeLists.txt
new file mode 100644
index 00000000..4ed4807c
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/CMakeLists.txt
@@ -0,0 +1,67 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileMatmulAutotuner LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+
+get_filename_component(CUDA_TOOLKIT_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+find_program(TILEIRAS_EXECUTABLE tileiras
+    HINTS "${CUDA_TOOLKIT_BIN_DIR}"
+    REQUIRED
+)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Include directories and libraries
+include_directories(../../../Common ../Benchmark_Common)
+
+# Source files
+add_executable(tileMatmulAutotuner
+    matmul_autotuner.cpp
+    autotuner_search_space.conf
+)
+
+target_compile_features(tileMatmulAutotuner PRIVATE cxx_std_20 cuda_std_20)
+
+target_include_directories(tileMatmulAutotuner PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+
+list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_INCLUDE_DIR)
+file(TO_CMAKE_PATH "${CUDA_INCLUDE_DIR}" CUDA_INCLUDE_DIR_FOR_DEFINE)
+file(TO_CMAKE_PATH "${TILEIRAS_EXECUTABLE}" TILEIRAS_EXECUTABLE_FOR_DEFINE)
+file(TO_CMAKE_PATH "${CMAKE_CUDA_COMPILER}" NVCC_EXECUTABLE_FOR_DEFINE)
+
+target_compile_definitions(tileMatmulAutotuner PRIVATE
+    CUDA_INCLUDE_PATH="${CUDA_INCLUDE_DIR_FOR_DEFINE}"
+    NVCC_PATH="${NVCC_EXECUTABLE_FOR_DEFINE}"
+    TILEIRAS_PATH="${TILEIRAS_EXECUTABLE_FOR_DEFINE}"
+)
+
+target_link_libraries(tileMatmulAutotuner PRIVATE
+    CUDA::cuda_driver
+    CUDA::cudart
+    CUDA::nvrtc
+)
+
+add_custom_command(TARGET tileMatmulAutotuner POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different
+    ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cu
+    ${CMAKE_CURRENT_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different
+    ${CMAKE_CURRENT_SOURCE_DIR}/autotuner_search_space.conf
+    ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/README.md b/cpp/9_CUDA_Tile/tileMatmulAutotuner/README.md
new file mode 100644
index 00000000..cdf8f549
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/README.md
@@ -0,0 +1,59 @@
+# tileMatmulAutotuner
+
+## Description
+
+A CUDA Tile C++ sample demonstrating how to autotune tile sizes and optimization hints for matrix multiplication with FP16 inputs and FP32 accumulation when compiling with nvrtc or nvcc.
+
+The sample explores combinations of tile sizes and optimization hints, compiles the Tile C++ matrix multiplication kernel, executes it, and reports the best measured configuration. The launch grid is derived from the selected tile size and matrix dimensions.
+The search space is read from `autotuner_search_space.conf` so one can edit tile sizes and hint values without rebuilding the sample.
+
+## Running
+
+```
+# Run autotuner. Validation is disabled by default.
+./tileMatmulAutotuner
+
+# Select a backend
+./tileMatmulAutotuner --backend=nvrtc
+./tileMatmulAutotuner --backend=nvcc
+
+# Enable CPU validation
+./tileMatmulAutotuner --validate
+
+# To run faster, skip warmups and set iterations to 1
+./tileMatmulAutotuner --skip-warmup -i 1
+
+# Show all options
+./tileMatmulAutotuner --help
+```
+
+NOTE: When using the nvrtc backend, the libnvrtc library must be on the dynamic library search path. This may require adding the CUDA Toolkit `lib` directory to the `LD_LIBRARY_PATH` (on Linux) or `PATH` (on Windows) environment variable.
+
+## Command-Line Options
+
+| Option | Description |
+|--------|-------------|
+| `--validate` | Enable CPU cross-validation. Validation is disabled by default. |
+| `--skip-warmup` | Disable warmup iterations. |
+| `--warmup=N` | Set warmup iterations. The default is 5. |
+| `-i N`, `--iters=N` | Set benchmark iterations. The default is 20. |
+| `--backend=nvrtc\|nvcc` | Select the backend. The default is `nvrtc`. |
+
+## Search Space Configuration
+
+Each `tile` line adds one `TILE_BLOCK_M`, `TILE_BLOCK_N`, and `TILE_BLOCK_K` combination. The `load_latency` and `store_latency` lines list values that are combined with every `tile` entry.
+
+```
+tile 64 64 32
+tile 128 64 32
+load_latency 2 5 8
+store_latency 2 5 8
+```
+
+Lines beginning with `#` are comments.
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later. The NVRTC backend invokes `tileiras` to compile Tile IR to cubin. JIT-compiling Tile IR to cubin with the CUDA Driver API instead requires version 590 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/autotuner_search_space.conf b/cpp/9_CUDA_Tile/tileMatmulAutotuner/autotuner_search_space.conf
new file mode 100644
index 00000000..af852666
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/autotuner_search_space.conf
@@ -0,0 +1,15 @@
+# tileMatmulAutotuner search space
+#
+# The autotuner tries every tile entry with every load_latency and
+# store_latency value listed below. Edit this file to experiment without
+# rebuilding the sample.
+
+# tile block_m block_n block_k
+tile 64 64 32
+tile 128 64 32
+tile 64 128 32
+tile 128 128 32
+
+# latency hint values
+load_latency 2 5 8
+store_latency 2 5 8
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_common.h b/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_common.h
new file mode 100644
index 00000000..5a33f52a
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_common.h
@@ -0,0 +1,314 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <cerrno>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <helper_string.h>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+enum class CompilerBackend {
+    NVRTC,  // selected with --backend=nvrtc (the README's default backend)
+    NVCC    // selected with --backend=nvcc; compileFileWithNVCC() shells out to nvcc
+};
+
+struct CompiledKernel {
+    std::vector<char> image;  // raw bytes of the compiled kernel (the NVCC backend stores the Tile cubin file here)
+};
+
+struct TileConfig {
+    int block_m;  // first value of a `tile` search-space line (TILE_BLOCK_M candidate)
+    int block_n;  // second value of a `tile` search-space line (TILE_BLOCK_N candidate)
+    int block_k;  // third value of a `tile` search-space line (TILE_BLOCK_K candidate)
+};
+
+struct SearchSpace {
+    std::vector<TileConfig> tile_options;    // one entry per `tile` line, in file order
+    std::vector<int> load_latency_options;   // all values from `load_latency` lines, in file order
+    std::vector<int> store_latency_options;  // all values from `store_latency` lines, in file order
+};
+
+static constexpr const char *kSearchSpaceFileName = "autotuner_search_space.conf";
+
+inline const char* compilerBackendName(CompilerBackend compiler_backend) {  // display name of the backend
+    return compiler_backend != CompilerBackend::NVCC ? "NVRTC" : "NVCC";  // anything but NVCC reports as NVRTC
+}
+
+inline int ceilDiv(int a, int b) {  // ceil(a / b) for b > 0
+    return a / b + (a % b > 0 ? 1 : 0);  // no a + b - 1 intermediate, so a large a cannot overflow
+}
+
+inline unsigned long getProcessId() {  // id of the current process, widened to unsigned long
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    return static_cast<unsigned long>(_getpid());  // Windows branch (<process.h> is included above)
+#else
+    return static_cast<unsigned long>(getpid());   // non-Windows branch (<unistd.h> is included above)
+#endif
+}
+
+inline std::string makeTempPath(const char *prefix, const char *suffix) {
+    // Builds <temp dir>/<prefix>_<pid>_<time()>_<per-process counter><suffix>; nothing is created on disk.
+    static unsigned int next_id = 0;
+    const std::string stamp = std::to_string(getProcessId()) + "_" + std::to_string(static_cast<long>(time(NULL)));
+    const std::string leaf = std::string(prefix) + "_" + stamp + "_" + std::to_string(next_id++) + suffix;
+    return (std::filesystem::temp_directory_path() / leaf).string();
+}
+
+inline std::string shellQuote(const std::string& arg) {
+    // Wraps arg in the platform's quote character and escapes any embedded
+    // occurrence of that character; every other byte is copied unchanged.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows: double quotes, with an embedded " written as \".
+    const char delimiter = '"';
+    const char *escaped_delimiter = "\\\"";
+#else
+    // Elsewhere: single quotes, with an embedded ' written as '\''
+    // (close the quote, emit an escaped quote, reopen the quote).
+    const char delimiter = '\'';
+    const char *escaped_delimiter = "'\\''";
+#endif
+
+    std::string result(1, delimiter);
+    for (const char ch : arg) {
+        if (ch != delimiter) {
+            result += ch;
+        } else {
+            result += escaped_delimiter;
+        }
+    }
+    result += delimiter;
+    return result;
+}
+
+inline std::string joinShellCommand(const std::vector<std::string>& args) {
+    // Quote every argument with shellQuote() and separate them with single spaces.
+    std::string joined;
+    for (size_t index = 0; index < args.size(); ++index) {
+        joined += (index == 0 ? "" : " ");
+        joined += shellQuote(args[index]);
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // on Windows the whole command line gets one more pair of double quotes
+    joined.insert(joined.begin(), '"');
+    joined.push_back('"');
+#endif
+    return joined;
+}
+
+inline std::vector<char> readBinaryFile(const std::string& path) {
+    // Open at the end of the file so tellg() reports its size in bytes.
+    std::ifstream stream(path, std::ios::in | std::ios::binary | std::ios::ate);
+    if (!stream.is_open()) {
+        std::cerr << "\nerror: unable to open " << path << " for reading!\n";
+        exit(EXIT_FAILURE);
+    }
+    const std::streamsize byte_count = stream.tellg();
+    if (byte_count < 0) {
+        std::cerr << "\nerror: unable to determine size of " << path << "\n";
+        exit(EXIT_FAILURE);
+    }
+    // Rewind and read the whole file in one call; an empty file skips the read.
+    std::vector<char> contents(static_cast<size_t>(byte_count));
+    stream.seekg(0, std::ios::beg);
+    if (byte_count != 0 && !stream.read(contents.data(), byte_count)) {
+        std::cerr << "\nerror: unable to read " << path << "\n";
+        exit(EXIT_FAILURE);
+    }
+    return contents;
+}
+
+inline std::string baseNameWithoutExtension(const std::string& path) {
+    // Drop everything up to the last '/' or '\', then everything from the last '.' onwards.
+    const size_t separator = path.find_last_of("/\\");
+    std::string name = path;
+    if (separator != std::string::npos) name.erase(0, separator + 1);
+    const size_t extension = name.rfind('.');
+    if (extension == std::string::npos) return name;
+    return name.substr(0, extension);
+}
+
+inline void appendTileBlockMacroOptions(std::vector<std::string>& options,
+                                        int block_m, int block_n, int block_k) {  // appends the three -D options, M then N then K
+    const char *names[] = {"-DTILE_BLOCK_M=", "-DTILE_BLOCK_N=", "-DTILE_BLOCK_K="};
+    const int sizes[] = {block_m, block_n, block_k};
+    for (int i = 0; i < 3; ++i) options.push_back(names[i] + std::to_string(sizes[i]));
+}
+
+inline bool parsePositiveInt(const std::string& text, int *value) {
+    // Base-10 integer in 1..INT_MAX with no trailing characters; *value is written only on success.
+    const char *begin = text.c_str();
+    char *stop = nullptr;
+    errno = 0;
+    const long number = std::strtol(begin, &stop, 10);
+    const bool consumed_all = (stop != begin) && (*stop == '\0');
+    if (errno != 0 || !consumed_all || number <= 0 || number > INT_MAX) return false;
+    *value = static_cast<int>(number);
+    return true;
+}
+
+[[noreturn]] inline void searchSpaceError(const char *filename, int line_number,
+                                          const std::string& message) {
+    // Prints "Error: <file>:<line>: <message>" to stderr and terminates; marked [[noreturn]] because callers rely on it.
+    fprintf(stderr, "Error: %s:%d: %s\n", filename, line_number, message.c_str());
+    exit(EXIT_FAILURE);
+}
+
+inline char *copyFilePath(const std::string& path) {
+    // Returns a malloc()-allocated copy of path, including the terminating NUL.
+    void *buffer = malloc(path.length() + 1);
+    if (buffer == NULL) {
+        fprintf(stderr, "Error: failed to allocate memory for file path\n");
+        exit(EXIT_FAILURE);
+    }
+    return static_cast<char *>(std::memcpy(buffer, path.c_str(), path.length() + 1));
+}
+
+inline char *findSampleFile(const char *filename, const char *executable_path) {
+    // Prefer a copy of the file that sits next to the executable.
+    std::filesystem::path exe_dir;
+    if (executable_path != NULL) {
+        exe_dir = std::filesystem::path(executable_path).parent_path();
+    }
+    if (!exe_dir.empty()) {
+        const std::filesystem::path beside_exe = exe_dir / filename;
+        if (std::filesystem::exists(beside_exe)) return copyFilePath(beside_exe.string());
+    }
+
+    // Otherwise defer to sdkFindFilePath().
+    return sdkFindFilePath(filename, executable_path);
+}
+
+inline SearchSpace loadSearchSpace(const char *filename) {
+    // Parses the search-space file: one directive per line, '#' starts a comment.
+    // Any malformed line, unknown directive, or missing directive kind is fatal.
+    std::ifstream input(filename);
+    if (!input.is_open()) {
+        fprintf(stderr, "Error: unable to open search space file %s\n", filename);
+        exit(EXIT_FAILURE);
+    }
+
+    SearchSpace search_space;
+    int line_number = 0;
+
+    // Shared handler for the two latency directives: every value on the line is
+    // validated and appended to `target`.
+    auto appendLatencies = [&](const std::vector<std::string>& values,
+                               std::vector<int>& target,
+                               const char *missing_message,
+                               const char *invalid_message) {
+        if (values.empty()) {
+            searchSpaceError(filename, line_number, missing_message);
+        }
+        for (const std::string& text : values) {
+            int latency = 0;
+            if (!parsePositiveInt(text, &latency)) {
+                searchSpaceError(filename, line_number, invalid_message);
+            }
+            target.push_back(latency);
+        }
+    };
+
+    for (std::string line; std::getline(input, line);) {
+        ++line_number;
+        // Strip the comment, if any, then split the rest on whitespace.
+        const size_t hash = line.find('#');
+        if (hash != std::string::npos) {
+            line.erase(hash);
+        }
+        std::istringstream tokens(line);
+        std::string directive;
+        if (!(tokens >> directive)) {
+            continue;  // blank or comment-only line
+        }
+        std::vector<std::string> values;
+        for (std::string word; tokens >> word;) {
+            values.push_back(word);
+        }
+
+        if (directive == "tile") {
+            if (values.size() != 3) {
+                searchSpaceError(filename, line_number,
+                                 "tile expects block_m block_n block_k");
+            }
+            TileConfig tile = {};
+            const bool all_valid = parsePositiveInt(values[0], &tile.block_m) &&
+                                   parsePositiveInt(values[1], &tile.block_n) &&
+                                   parsePositiveInt(values[2], &tile.block_k);
+            if (!all_valid) {
+                searchSpaceError(filename, line_number,
+                                 "tile values must be positive integers");
+            }
+            search_space.tile_options.push_back(tile);
+        } else if (directive == "load_latency") {
+            appendLatencies(values, search_space.load_latency_options,
+                            "load_latency expects at least one value",
+                            "load_latency values must be positive integers");
+        } else if (directive == "store_latency") {
+            appendLatencies(values, search_space.store_latency_options,
+                            "store_latency expects at least one value",
+                            "store_latency values must be positive integers");
+        } else {
+            searchSpaceError(filename, line_number,
+                             "unknown search space directive '" + directive + "'");
+        }
+    }
+
+    // Each directive kind must have contributed at least one entry.
+    if (search_space.tile_options.empty()) {
+        fprintf(stderr, "Error: search space file %s does not list any tile entries\n", filename);
+        exit(EXIT_FAILURE);
+    }
+    if (search_space.load_latency_options.empty()) {
+        fprintf(stderr, "Error: search space file %s does not list any load_latency entries\n",
+                filename);
+        exit(EXIT_FAILURE);
+    }
+    if (search_space.store_latency_options.empty()) {
+        fprintf(stderr, "Error: search space file %s does not list any store_latency entries\n",
+                filename);
+        exit(EXIT_FAILURE);
+    }
+
+    return search_space;
+}
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvcc.h b/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvcc.h
new file mode 100644
index 00000000..58a7c452
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvcc.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "backend_common.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <filesystem>
+#include <iostream>
+#include <string>
+#include <system_error>
+#include <vector>
+
+inline CompiledKernel compileFileWithNVCC(const char *filename,
+                                          int sm_value,
+                                          int block_m, int block_n, int block_k,
+                                          const std::vector<std::string>& extra_flags) {
+    // Compiles `filename` with nvcc (--enable-tile --keep) in a fresh temp directory and
+    // returns the bytes of the kept <base>.tile.cubin; any failure is reported and exits.
+    const char *include_path = CUDA_INCLUDE_PATH;
+    if (include_path[0] == '\0') {
+        fprintf(stderr, "\n ERROR: unable to locate CUDA include directory containing cuda_fp16.h\n");
+        exit(EXIT_FAILURE);
+    }
+
+    std::filesystem::path keep_dir = makeTempPath("matmul_nvcc_keep", "");
+    std::error_code ec;
+    if (!std::filesystem::create_directory(keep_dir, ec)) {
+        std::cerr << "\nerror: unable to create " << keep_dir.string()
+                  << " (" << ec.message() << ")\n";
+        exit(EXIT_FAILURE);
+    }
+    // best-effort cleanup: the error_code overload never throws, so a failed delete cannot abort the run
+    auto removeKeepDir = [&]() { std::filesystem::remove_all(keep_dir, ec); };
+
+    std::string base = baseNameWithoutExtension(filename);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    const char *object_suffix = ".obj";
+#else
+    const char *object_suffix = ".o";
+#endif
+    std::filesystem::path object_file = keep_dir / (base + object_suffix);
+    std::filesystem::path tile_cubin_file = keep_dir / (base + ".tile.cubin");
+
+    std::vector<std::string> args = {
+        NVCC_PATH,
+        "--enable-tile",
+        "-std=c++20",
+        "-arch=sm_" + std::to_string(sm_value),
+        "-lineinfo",
+        "-c",
+        filename,
+        "-o",
+        object_file.string(),
+        "--keep",
+        "--keep-dir",
+        keep_dir.string(),
+        "-I",
+        include_path
+    };
+    appendTileBlockMacroOptions(args, block_m, block_n, block_k);
+    args.insert(args.end(), extra_flags.begin(), extra_flags.end());
+
+    std::string cmd = joinShellCommand(args);
+    std::cerr << "\nCompiling file with NVCC\n";
+    int ret = system(cmd.c_str());
+    if (ret != 0) {
+        fprintf(stderr, "Error: nvcc compilation failed with code %d\n", ret);
+        fprintf(stderr, "Command: %s\n", cmd.c_str());
+        removeKeepDir();
+        exit(EXIT_FAILURE);
+    }
+
+    if (!std::filesystem::exists(tile_cubin_file, ec)) {
+        fprintf(stderr, "Error: nvcc did not produce expected Tile cubin %s\n",
+                tile_cubin_file.string().c_str());
+        removeKeepDir();
+        exit(EXIT_FAILURE);
+    }
+
+    CompiledKernel kernel;
+    kernel.image = readBinaryFile(tile_cubin_file.string());
+    removeKeepDir();
+    return kernel;
+}
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvrtc.h b/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvrtc.h
new file mode 100644
index 00000000..85619461
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/backend_nvrtc.h
@@ -0,0 +1,186 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "backend_common.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <nvrtc.h>
+
+#define NVRTC_SAFE_CALL(Name, x) /* evaluates x once; on failure prints Name and exits */ \
+    do {                                                                     \
+        nvrtcResult nvrtc_safe_call_status_ = (x); /* name unlikely to shadow x */ \
+        if (nvrtc_safe_call_status_ != NVRTC_SUCCESS) {                      \
+            std::cerr << "\nerror: " << Name << " failed with error " <<     \
+                      nvrtcGetErrorString(nvrtc_safe_call_status_) << "\n";  \
+            exit(EXIT_FAILURE);                                              \
+        }                                                                    \
+    } while(0)
+
+// Writes the Tile IR buffer to a temporary file, runs tileiras on it for sm_<sm_value>, and
+// returns the resulting cubin bytes. A failed open, write, or tileiras run prints a message
+// and terminates the process; both temporary files are removed before returning.
+inline std::vector<char> compileTileIRToCubin(const char *tileIR,
+                                              size_t tileIRSize,
+                                              int sm_value) {
+    // Generate unique temporary file names using PID, timestamp, and a local counter.
+    std::string tileir_file = makeTempPath("tileir", ".bc");
+    std::string cubin_file = makeTempPath("cubin", ".cubin");
+
+    // Write TileIR to a temporary file because tileiras consumes files rather than memory buffers.
+    FILE* fp = fopen(tileir_file.c_str(), "wb");
+    if (!fp) {
+        fprintf(stderr, "Error: failed to open %s for writing\n", tileir_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+    // A short write or a failed close would hand tileiras a truncated module, so both are fatal.
+    const bool wrote_all = fwrite(tileIR, 1, tileIRSize, fp) == tileIRSize;
+    if (fclose(fp) != 0 || !wrote_all) {
+        fprintf(stderr, "Error: failed to write %s\n", tileir_file.c_str());
+        remove(tileir_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    /*
+     * Ideally we would use the Driver API to compile TileIR to cubin with the
+     * latest driver installed. However, to avoid the hassle of upgrading the
+     * driver, we use tileiras for now, which is handily available in CUDA
+     * Toolkit.
+     *
+     * The Driver API path would look like:
+     * (requires #include <helper_cuda_drvapi.h>)
+     *
+     *     CUmodule module;
+     *     CUjit_option options[] = { CU_JIT_GENERATE_LINE_INFO };
+     *     void* optionValues[] = { (void*)(uintptr_t)1 };  // line info
+     *     unsigned int numOptions = sizeof(options) / sizeof(options[0]);
+     *     checkCudaErrors(cuModuleLoadDataEx(&module, tileIR, numOptions,
+     *                                        options, optionValues));
+     */
+
+    // Compile TileIR to cubin using tileiras from the configured CUDA Toolkit.
+    // This happens before benchmarking so the timed path matches the NVCC backend.
+    std::string cmd = joinShellCommand({
+        TILEIRAS_PATH, "-arch=sm_" + std::to_string(sm_value), tileir_file, "-o", cubin_file
+    });
+    int ret = system(cmd.c_str());
+    if (ret != 0) {
+        fprintf(stderr, "Error: tileiras compilation failed with code %d\n", ret);
+        fprintf(stderr, "Command: %s\n", cmd.c_str());
+        remove(tileir_file.c_str());
+        remove(cubin_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // Read the generated cubin into memory so later benchmark iterations do not depend on temp files.
+    std::vector<char> cubin = readBinaryFile(cubin_file);
+
+    // Remove temporary files after the cubin has been captured.
+    remove(tileir_file.c_str());
+    remove(cubin_file.c_str());
+    return cubin;
+}
+
+// Compiles `filename` with NVRTC (Tile enabled, given block sizes plus extra_flags), converts the
+// resulting Tile IR to a cubin via compileTileIRToCubin, and exits the process on any failure.
+inline CompiledKernel compileFileWithNVRTC(const char *filename,
+                                           int sm_value,
+                                           int block_m, int block_n, int block_k,
+                                           const std::vector<std::string>& extra_flags) {
+    // Check include path for cuda_fp16.h
+    const char *ptr = CUDA_INCLUDE_PATH;
+    if (ptr[0] == '\0') {
+      printf("\n ERROR: unable to locate CUDA include directory containing cuda_fp16.h\n");
+      exit(EXIT_FAILURE);
+    }
+    std::vector<std::string> option_storage = {
+        "-I",
+        ptr,
+        "-std=c++20",
+        "-enable-tile",
+        "-lineinfo",
+        "-arch=compute_" + std::to_string(sm_value)
+    };
+    appendTileBlockMacroOptions(option_storage, block_m, block_n, block_k);
+    option_storage.insert(option_storage.end(), extra_flags.begin(), extra_flags.end());
+
+    std::vector<const char*> argv_vec;
+    for (const auto& option : option_storage) {
+      argv_vec.push_back(option.c_str());
+    }
+    const char **argv = argv_vec.data();
+    int argc = static_cast<int>(argv_vec.size());
+    std::cerr << "\nCompiling file with NVRTC\n";
+    std::ifstream inputFile(filename, std::ios::in | std::ios::binary |
+              std::ios::ate);
+    if (!inputFile.is_open()) {
+        std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
+        exit(EXIT_FAILURE);
+    }
+    // Opened with ios::ate, so tellg() reports the file size; -1 means the stream failed.
+    const std::streamoff inputSize = inputFile.tellg();
+    std::string source(inputSize > 0 ? static_cast<size_t>(inputSize) : 0, '\0');
+    if (inputSize < 0 || !inputFile.seekg(0, std::ios::beg).read(&source[0], inputSize)) {
+        std::cerr << "\nerror: unable to read " << filename << "!\n";
+        exit(EXIT_FAILURE);
+    }
+    inputFile.close();
+
+    // Compile the source string to PTX and Tile IR.
+    nvrtcProgram prog;
+    NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, source.c_str(),
+    "testprog", 0, NULL, NULL));
+    nvrtcResult res = nvrtcCompileProgram(prog, argc, argv);
+    // Dump the NVRTC compilation log before checking res, so failures show their diagnostics.
+    size_t logSize = 0;
+    NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));
+    // One spare zero byte guarantees NUL termination for the stream insertion below.
+    std::vector<char> log(logSize + 1, '\0');
+    NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log.data()));
+    std::cerr << "\n compilation log ---\n";
+    std::cerr << log.data();
+    std::cerr << "\n end log ---\n\n";
+    NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
+
+    // Fetch Tile IR and compile it to cubin before benchmarking.
+    size_t tileIRSize = 0;
+    NVRTC_SAFE_CALL("nvrtcGetTileIRSize", nvrtcGetTileIRSize(prog, &tileIRSize));
+    std::vector<char> tileIR(tileIRSize);
+    NVRTC_SAFE_CALL("nvrtcGetTileIR", nvrtcGetTileIR(prog, tileIR.data()));
+    // The Tile IR has been copied out, so the program can be released before tileiras runs.
+    NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
+    CompiledKernel kernel;
+    kernel.image = compileTileIRToCubin(tileIR.data(), tileIR.size(), sm_value);
+    return kernel;
+}
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul.cu b/cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul.cu
new file mode 100644
index 00000000..0fa4aebb
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul.cu
@@ -0,0 +1,101 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * CUDA Tile C++ matrix multiplication kernel used by tileMatmulAutotuner.
+ *
+ * This sample implements a tiled FP16 -> FP32 matrix multiplication with
+ * ct::partition_view and ct::mma. The autotuner compiles this file repeatedly
+ * with TILE_BLOCK_M, TILE_BLOCK_N, TILE_BLOCK_K, LOAD_LATENCY, and
+ * STORE_LATENCY defined on the compiler command line.
+ *
+ * Approach:
+ *   - Uses ct::tensor_span and ct::partition_view for blocked access.
+ *   - Uses a K-dimension accumulation loop with ct::mma.
+ *   - Loads FP16 inputs into tiles and accumulates into FP32.
+ */
+
+#include "cuda_tile.h"
+#include <cuda_fp16.h>
+
+namespace ct = cuda::tiles;
+
+extern "C" __tile_global__ void matmul_tile(float* __restrict__ _C,
+                                             const __half* __restrict__ _A,
+                                             const __half* __restrict__ _B,
+                                             int _M, int _N, int _K) {  // C (MxN, FP32) = A (MxK) x B (KxN), A and B in FP16
+    float* C = ct::assume_aligned<16>(_C);  // NOTE(review): presumably a 16-byte alignment promise - confirm in the CUDA Tile docs
+    const __half* A = ct::assume_aligned<16>(_A);
+    const __half* B = ct::assume_aligned<16>(_B);
+    auto M = ct::assume_divisible<16>(_M);  // NOTE(review): presumably promises the extent is a multiple of 16 - confirm
+    auto N = ct::assume_divisible<16>(_N);
+    auto K = ct::assume_divisible<16>(_K);
+
+    // Wrap the raw pointers in tensor spans with runtime extents: A is MxK, B is KxN, C is MxN.
+    auto a_span = ct::tensor_span{A, ct::extents{M, K}};
+    auto b_span = ct::tensor_span{B, ct::extents{K, N}};
+    auto c_span = ct::tensor_span{C, ct::extents{M, N}};
+
+    // Partition each span into tiles whose shapes come from the TILE_BLOCK_* macros.
+    auto a_view = ct::partition_view{a_span, ct::shape<TILE_BLOCK_M, TILE_BLOCK_K>{}};
+    auto b_view = ct::partition_view{b_span, ct::shape<TILE_BLOCK_K, TILE_BLOCK_N>{}};
+    auto c_view = ct::partition_view{c_span, ct::shape<TILE_BLOCK_M, TILE_BLOCK_N>{}};
+
+    // Block indices along M and N from the 2D grid; the third component is unused here.
+    auto [pid_m, pid_n, dummy] = ct::bid();
+
+    // initialize FP32 accumulator (one TILE_BLOCK_M x TILE_BLOCK_N tile of zeros)
+    auto acc = ct::zeros<ct::tile<float, ct::shape<TILE_BLOCK_M, TILE_BLOCK_N>>>();
+
+    // Walk the K dimension one TILE_BLOCK_K block at a time; the block count is rounded up.
+    int num_k_blocks = (K + TILE_BLOCK_K - 1) / TILE_BLOCK_K;
+    for (auto k_block : ct::irange(0, num_k_blocks)) {
+        ct::tile<__half, ct::shape<TILE_BLOCK_M, TILE_BLOCK_K>> a_tile;
+        ct::tile<__half, ct::shape<TILE_BLOCK_K, TILE_BLOCK_N>> b_tile;
+
+        // load blocks of A and B (FP16); each load carries a hint built from LOAD_LATENCY
+        [[
+            cutile::hint(0, latency=LOAD_LATENCY),
+        ]]
+        a_tile = a_view.load(pid_m, k_block);
+
+        [[
+            cutile::hint(0, latency=LOAD_LATENCY),
+        ]]
+        b_tile = b_view.load(k_block, pid_n);
+
+        // accumulate: acc += A_block @ B_block (FP16 inputs, FP32 accumulator)
+        // ct::mma handles mixed precision: FP16 operands with FP32 accumulator.
+        acc = ct::mma(a_tile, b_tile, acc);
+    }
+
+    // store result (FP32) into this block's C tile; the hint is built from STORE_LATENCY
+    [[
+        cutile::hint(0, latency=STORE_LATENCY),
+    ]]
+    c_view.store(acc, pid_m, pid_n);
+}
diff --git a/cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul_autotuner.cpp b/cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul_autotuner.cpp
new file mode 100644
index 00000000..0acb8ebf
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileMatmulAutotuner/matmul_autotuner.cpp
@@ -0,0 +1,331 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * CUDA Tile C++ matrix multiplication autotuner.
+ *
+ * This sample is the host-side driver for the tiled matrix multiplication
+ * kernel in matmul.cu. It compiles that kernel repeatedly with different
+ * TILE_BLOCK_M, TILE_BLOCK_N, TILE_BLOCK_K, LOAD_LATENCY, and STORE_LATENCY
+ * values configured in a search-space file, derives the launch grid from the
+ * selected tile size, and reports the fastest configuration for the requested
+ * problem size.
+ *
+ * Backend flow:
+ *   - NVRTC compiles matmul.cu to TileIR, then invokes tileiras to produce a
+ *     cubin image.
+ *   - NVCC compiles matmul.cu as a standalone source file and reuses the
+ *     generated Tile cubin artifact.
+ *   - Both paths load the resulting cubin with the CUDA Driver API and launch
+ *     the same matmul_tile entry point.
+ */
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+
+#include "backend_common.h"
+#include "backend_nvcc.h"
+#include "backend_nvrtc.h"
+#include "matmul_benchmark.h"
+#include <helper_cuda_drvapi.h>
+
+// global SM value (compute capability)
+static int smValue = 0;
+static constexpr const char *kMatmulKernelName = "matmul_tile";
+
+CompilerBackend parseCompilerBackendValue(const char *value) {
+    // Translate a backend name into its enum value; anything else is a fatal usage error.
+    const bool is_nvrtc = std::strcmp(value, "nvrtc") == 0;
+    const bool is_nvcc = !is_nvrtc && std::strcmp(value, "nvcc") == 0;
+
+    if (!is_nvrtc && !is_nvcc) {
+        fprintf(stderr, "Error: unsupported backend '%s'\n", value);
+        fprintf(stderr, "Expected 'nvrtc' or 'nvcc'.\n");
+        exit(EXIT_FAILURE);
+    }
+    return is_nvrtc ? CompilerBackend::NVRTC : CompilerBackend::NVCC;
+}
+
+void printCompilerOptions() {  // writes the usage text for the backend-selection flag to stdout
+    fputs("Backend options:\n", stdout);
+    fputs("  --backend=nvrtc|nvcc   Select backend (default: NVRTC)\n", stdout);
+    fputs("\n", stdout);
+}
+
+CompilerBackend parseCompilerBackendArgs(int argc, char** argv, std::vector<char*>& benchmark_argv) {
+    // Consumes --backend=VALUE / --backend VALUE; argv[0] and every other argument are forwarded.
+    CompilerBackend selected = CompilerBackend::NVRTC;
+    benchmark_argv.assign(1, argv[0]);
+    for (int idx = 1; idx < argc; ++idx) {
+        char *arg = argv[idx];
+        const bool wants_help = std::strcmp(arg, "--help") == 0 || std::strcmp(arg, "-h") == 0;
+        if (std::strncmp(arg, "--backend=", 10) == 0) {
+            selected = parseCompilerBackendValue(arg + 10);
+            continue;
+        }
+        if (std::strcmp(arg, "--backend") == 0) {
+            if (idx + 1 >= argc) {
+                fprintf(stderr, "Error: %s requires an argument\n", arg);
+                exit(EXIT_FAILURE);
+            }
+            selected = parseCompilerBackendValue(argv[++idx]);
+            continue;
+        }
+        if (wants_help) printCompilerOptions();  // help text is printed here and the flag is still forwarded
+        benchmark_argv.push_back(arg);
+    }
+    return selected;
+}
+
+void setSMValue() {
+    // Queries device 0 through the Driver API and caches its compute capability
+    // in smValue as major * 10 + minor.
+    checkCudaErrors(cuInit(0));
+
+    CUdevice dev;
+    checkCudaErrors(cuDeviceGet(&dev, 0));
+
+    int cc_major = 0;
+    int cc_minor = 0;
+    checkCudaErrors(cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
+    checkCudaErrors(cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
+    printf("GPU Compute Capability: %d.%d\n", cc_major, cc_minor);
+    smValue = cc_major * 10 + cc_minor;
+}
+
+CompiledKernel compileFile(const char *filename,
+                           int block_m, int block_n, int block_k,
+                           CompilerBackend compiler_backend,
+                           const std::vector<std::string>& extra_flags = {}) {
+    // Dispatch on the backend: NVCC when requested, NVRTC for every other value.
+    const bool use_nvcc = (compiler_backend == CompilerBackend::NVCC);
+    return use_nvcc ? compileFileWithNVCC(filename, smValue, block_m, block_n, block_k, extra_flags)
+                    : compileFileWithNVRTC(filename, smValue, block_m, block_n, block_k, extra_flags);
+}
+
+void loadAndExecuteKernel(const CompiledKernel& compiled_kernel,
+                          CUdeviceptr d_A, CUdeviceptr d_B, CUdeviceptr d_C,
+                          int M, int N, int K,
+                          unsigned int gridDimX, unsigned int gridDimY, unsigned int sMem) {
+    // Loads the cubin image, launches matmul_tile on a gridDimX x gridDimY grid of
+    // 1x1x1 blocks, waits for completion, then unloads the module again.
+    CUmodule cubin_module;
+    checkCudaErrors(cuModuleLoadData(&cubin_module, compiled_kernel.image.data()));
+
+    CUfunction entry_point;
+    checkCudaErrors(cuModuleGetFunction(&entry_point, cubin_module, kMatmulKernelName));
+
+    // Parameter order follows the kernel signature: C, A, B, M, N, K.
+    void* launch_params[] = {&d_C, &d_A, &d_B, &M, &N, &K};
+
+    const unsigned int grid[3] = {gridDimX, gridDimY, 1};
+    const unsigned int block[3] = {1, 1, 1};
+    CUstream default_stream = 0;
+
+    checkCudaErrors(cuLaunchKernel(entry_point,
+                                   grid[0], grid[1], grid[2],
+                                   block[0], block[1], block[2],
+                                   sMem, default_stream,
+                                   launch_params,
+                                   NULL));
+    checkCudaErrors(cuCtxSynchronize());
+
+    // The module is loaded afresh on every call, so release it before returning.
+    checkCudaErrors(cuModuleUnload(cubin_module));
+}
+
+void autotuner(int M, int N, int K,
+               const char *kernel_file,
+               const SearchSpace& search_space,
+               CompilerBackend compiler_backend) {
+    // Benchmarks every (tile, load latency, store latency) combination in search_space for
+    // C[MxN] = A[MxK] x B[KxN] and prints the configuration with the highest GFLOPS.
+    printf("\n=== Matrix: C[%dx%d] = A[%dx%d] x B[%dx%d] (FP16->FP32) ===\n",
+           M, N, M, K, K, N);
+    printf("    FLOPs: %.2f GFLOP\n", 2.0 * M * N * K / 1e9);
+
+    // allocate and initialize (FP16 for A and B)
+    std::vector<__half> h_A(M * K), h_B(K * N);
+    std::vector<float> h_C;
+
+    srand(42);
+    for (int i = 0; i < M * K; i++) h_A[i] = __float2half((float)rand() / RAND_MAX - 0.5f);
+    for (int i = 0; i < K * N; i++) h_B[i] = __float2half((float)rand() / RAND_MAX - 0.5f);
+
+    // compute the CPU reference unless validation is disabled
+    if (use_validation()) {
+        h_C.resize(M * N);
+        matmul_cpu(h_C.data(), h_A.data(), h_B.data(), M, N, K);
+    }
+
+    // allocate device memory
+    CUdeviceptr d_A, d_B, d_C;
+    checkCudaErrors(cuMemAlloc(&d_A, M * K * sizeof(__half)));
+    checkCudaErrors(cuMemAlloc(&d_B, K * N * sizeof(__half)));
+    checkCudaErrors(cuMemAlloc(&d_C, M * N * sizeof(float)));
+    checkCudaErrors(cuMemcpyHtoD(d_A, h_A.data(), M * K * sizeof(__half)));
+    checkCudaErrors(cuMemcpyHtoD(d_B, h_B.data(), K * N * sizeof(__half)));
+
+    // helper lambda to clear output
+    auto clear_output = [&]() {
+        std::vector<float> zeros(M * N, 0.0f);
+        checkCudaErrors(cuMemcpyHtoD(d_C, zeros.data(), M * N * sizeof(float)));
+    };
+
+    struct AutotuneResult {
+        int block_m, block_n, block_k;
+        int grid_x;
+        int grid_y;
+        int load_latency;
+        int store_latency;
+        BenchmarkResult result;
+    };
+    std::vector<AutotuneResult> autotune_results;
+
+    size_t config_count = 0;
+    size_t total_configs = search_space.tile_options.size() *
+                           search_space.load_latency_options.size() *
+                           search_space.store_latency_options.size();
+
+    for (const auto& tile : search_space.tile_options) {
+        int grid_x = ceilDiv(M, tile.block_m);
+        int grid_y = ceilDiv(N, tile.block_n);
+        for (int load_lat : search_space.load_latency_options) {
+            for (int store_lat : search_space.store_latency_options) {
+                config_count++;
+                printf("  [%zu/%zu] ", config_count, total_configs);
+
+                std::vector<std::string> compile_flags = {
+                    "-DLOAD_LATENCY=" + std::to_string(load_lat),
+                    "-DSTORE_LATENCY=" + std::to_string(store_lat)
+                };
+
+                CompiledKernel compiled_kernel = compileFile(kernel_file, tile.block_m, tile.block_n,
+                                                             tile.block_k, compiler_backend, compile_flags);
+
+                clear_output();
+
+                std::string config_name = "bm=" + std::to_string(tile.block_m) +
+                                          ",bn=" + std::to_string(tile.block_n) +
+                                          ",bk=" + std::to_string(tile.block_k) +
+                                          ",gx=" + std::to_string(grid_x) +
+                                          ",gy=" + std::to_string(grid_y) +
+                                          ",ld=" + std::to_string(load_lat) +
+                                          ",st=" + std::to_string(store_lat);
+                auto result = run_benchmark(config_name.c_str(),
+                    [&]() {
+                        loadAndExecuteKernel(compiled_kernel, d_A, d_B, d_C, M, N, K,
+                                             grid_x, grid_y, 0);
+                    },
+                    [&]() {
+                        std::vector<float> h_result(M * N);
+                        checkCudaErrors(cuMemcpyDtoH(h_result.data(), d_C,
+                                                     M * N * sizeof(float)));
+                        return verify_matmul_result(config_name.c_str(),
+                                                    h_result.data(), h_C.data(), M, N);
+                    },
+                    M, N, K);
+                print_result(result);
+
+                autotune_results.push_back({tile.block_m, tile.block_n, tile.block_k,
+                                            grid_x, grid_y, load_lat, store_lat, result});
+            }
+        }
+    }
+
+    // max_element returns end() for an empty range, so only report a winner when one exists.
+    if (autotune_results.empty()) {
+        fprintf(stderr, "\n  Error: the search space produced no configurations to benchmark\n");
+    } else {
+        auto best = std::max_element(autotune_results.begin(), autotune_results.end(),
+            [](const AutotuneResult& a, const AutotuneResult& b) { return a.result.gflops < b.result.gflops; });
+
+        printf("\n  *** BEST CONFIGURATION ***\n");
+        printf("  BLOCK_M=%d, BLOCK_N=%d, BLOCK_K=%d\n",
+               best->block_m, best->block_n, best->block_k);
+        printf("  LOAD_LATENCY=%d, STORE_LATENCY=%d, grid_x=%d, grid_y=%d\n",
+               best->load_latency, best->store_latency, best->grid_x, best->grid_y);
+        printf("  Performance: %.1f GFLOPS, %.3f ms, %.1f GB/s\n",
+               best->result.gflops, best->result.time_ms, best->result.bandwidth_gb_s);
+    }
+
+    checkCudaErrors(cuMemFree(d_A));
+    checkCudaErrors(cuMemFree(d_B));
+    checkCudaErrors(cuMemFree(d_C));
+}
+
+int main(int argc, char** argv) {
+    // Split off --backend handling; the remaining arguments go to the shared benchmark parser.
+    std::vector<char*> benchmark_argv;
+    CompilerBackend compiler_backend = parseCompilerBackendArgs(argc, argv, benchmark_argv);
+    parse_benchmark_args(static_cast<int>(benchmark_argv.size()), benchmark_argv.data());
+    print_device_info();
+    // initialize CUDA and get compute capability
+    setSMValue();
+
+    CUcontext context;
+    CUdevice cuDevice = 0;
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGet(&cuDevice, 0));
+    checkCudaErrors(cuCtxCreate(&context, NULL, 0, cuDevice));  // destroyed on every exit path below
+
+    printf("\nMatrix Multiplication Autotuner (FP16 inputs, FP32 accumulate)\n");
+    printf("==============================================================\n");
+    printf("Backend: %s\n", compilerBackendName(compiler_backend));
+
+    char *kernel_file = findSampleFile("matmul.cu", argv[0]);
+    if (kernel_file == NULL) {
+        fprintf(stderr, "Error: unable to locate matmul.cu\n");
+        checkCudaErrors(cuCtxDestroy(context));
+        return 1;
+    }
+    char *search_space_file = findSampleFile(kSearchSpaceFileName, argv[0]);
+    if (search_space_file == NULL) {
+        fprintf(stderr, "Error: unable to locate %s\n", kSearchSpaceFileName);
+        free(kernel_file);
+        checkCudaErrors(cuCtxDestroy(context));
+        return 1;
+    }
+
+    SearchSpace search_space = loadSearchSpace(search_space_file);
+    printf("Search space: %s\n", search_space_file);
+    printf("Tuning for M=1024, N=4096, K=1024\n");
+    autotuner(1024, 4096, 1024, kernel_file, search_space, compiler_backend);
+    free(kernel_file);
+    free(search_space_file);
+    checkCudaErrors(cuCtxDestroy(context));
+    return 0;
+}
diff --git a/cpp/9_CUDA_Tile/tileRope/CMakeLists.txt b/cpp/9_CUDA_Tile/tileRope/CMakeLists.txt
new file mode 100644
index 00000000..e9c0d7f7
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileRope/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileRope LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile") # every CUDA source in this project is compiled with --enable-tile
+
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Add the repository-level Common directory to the include path
+include_directories(../../../Common)
+
+# Build the tileRope executable from its single source file
+add_executable(tileRope tileRope.cu)
+
+target_compile_features(tileRope PRIVATE cxx_std_20 cuda_std_20) # C++20 for both host and CUDA code
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileRope/README.md b/cpp/9_CUDA_Tile/tileRope/README.md
new file mode 100644
index 00000000..43cf131b
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileRope/README.md
@@ -0,0 +1,29 @@
+# tileRope
+
+## Description
+
+This sample demonstrates a Rotary Position Embedding (RoPE) forward pass
+using CUDA Tile C++. RoPE injects positional information into the query
+and key projections of an attention layer by rotating pairs of features
+in the head dimension by per-position angles. This implementation uses
+the split-half convention: for each token at position `s` the pair
+`(q[i], q[i + D/2])` is rotated by `theta = s * 10000^(-2i / D)`, so
+`q[i]' = q[i]*cos(theta) - q[i+D/2]*sin(theta)` and
+`q[i+D/2]' = q[i]*sin(theta) + q[i+D/2]*cos(theta)`. The
+`cuda::tiles::partition_view` type is used to partition each (batch,
+position) token's Q and K tensors into 2D tiles over (heads,
+half_rope_dim), and a single block processes all heads for one token
+in parallel, writing the result back in place. A SIMT kernel is used
+to initialize the inputs and the cos/sin tables.
+
+## Expected Output
+
+```
+Success! RoPE matches expected results.
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileRope/tileRope.cu b/cpp/9_CUDA_Tile/tileRope/tileRope.cu
new file mode 100644
index 00000000..8356d849
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileRope/tileRope.cu
@@ -0,0 +1,274 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates a Rotary Position Embedding (RoPE) forward pass
+ * using CUDA Tile C++. RoPE injects positional information into the query
+ * and key projections of an attention layer by rotating pairs of features
+ * in the head dimension by per-position angles. This implementation uses
+ * the split-half convention: for each token at position 's' the pair
+ * (q[i], q[i + D/2]) is rotated by theta = s * 10000^(-2i / D), yielding
+ *     q[i]'       = q[i] * cos(theta) - q[i + D/2] * sin(theta)
+ *     q[i + D/2]' = q[i] * sin(theta) + q[i + D/2] * cos(theta)
+ * Each block handles one (batch, position) token and processes all heads
+ * in parallel using 2D tiles over (heads, half_rope_dim). The kernel
+ * writes back to q and k in place. A SIMT kernel is used to initialize
+ * the inputs and the cos/sin tables.
+ */
+
+#include "helper_cuda.h"
+
+#include "cuda_tile.h"
+#include "cuda_fp16.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+/* Compile-time sample shape: one block per (batch, position) token. */
+constexpr int BATCH         = 1;             /* batch extent of q and k */
+constexpr int Q_HEADS       = 8;             /* heads extent of q */
+constexpr int K_HEADS       = 8;             /* heads extent of k */
+constexpr int SEQ_LEN       = 64;            /* positions per batch entry */
+constexpr int HEAD_DIM      = 64;            /* features per head */
+constexpr int HALF_ROPE_DIM = HEAD_DIM / 2;  /* rotation pairs per head; entries per cos/sin row */
+constexpr int BLOCK_QH      = Q_HEADS;       /* heads-axis tile extent for q (all heads in one tile) */
+constexpr int BLOCK_KH      = K_HEADS;       /* heads-axis tile extent for k (all heads in one tile) */
+constexpr int BLOCK_HD      = HALF_ROPE_DIM; /* head_dim-axis tile extent: one half of a head */
+constexpr int COS_BS        = 1;             /* batch extent of cos/sin; 1 = one table shared by every batch */
+
+constexpr std::size_t Q_SIZE   = (std::size_t)BATCH * Q_HEADS * SEQ_LEN * HEAD_DIM;   /* element count of q */
+constexpr std::size_t K_SIZE   = (std::size_t)BATCH * K_HEADS * SEQ_LEN * HEAD_DIM;   /* element count of k */
+constexpr std::size_t COS_SIZE = (std::size_t)COS_BS * SEQ_LEN * HALF_ROPE_DIM;       /* element count of cos (and of sin) */
+
+constexpr std::size_t cmax(std::size_t a, std::size_t b) { return a > b ? a : b; } /* constexpr max of two sizes */
+constexpr std::size_t INIT_N = cmax(Q_SIZE, cmax(K_SIZE, COS_SIZE)); /* largest array; sizes the init-kernel launch */
+
+/* Device-side fill of q, k and the rotation tables; each thread writes the
+ * element at its flat global index in every array that index fits in.
+ * cos/sin are indexed as (COS_BS, SEQ_LEN, HALF_ROPE_DIM). */
+__global__ void initializeInputs(__half* q, __half* k, __half* cos, __half* sin) {
+  const std::size_t gid = (std::size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  const int feature     = (int)(gid % HEAD_DIM);
+
+  /* Periodic test pattern over the feature index: q in [-0.5, 0.5],
+   * k in [-0.5, 0.7]. */
+  if (gid < Q_SIZE) q[gid] = __half{float(feature % 11) / 10.0f - 0.5f};
+  if (gid < K_SIZE) k[gid] = __half{float(feature % 13) / 10.0f - 0.5f};
+
+  if (gid >= COS_SIZE) return;
+
+  /* angle = position * 10000^(-2 * pair / HEAD_DIM) */
+  const int pair     = (int)(gid % HALF_ROPE_DIM);
+  const int position = (int)((gid / HALF_ROPE_DIM) % SEQ_LEN);
+  const float freq   = powf(10000.0f, -2.0f * float(pair) / float(HEAD_DIM));
+  const float angle  = float(position) * freq;
+
+  cos[gid] = __half{cosf(angle)};
+  sin[gid] = __half{sinf(angle)};
+}
+
+/* RoPE forward kernel: one block rotates all heads of one (batch, seq)
+ * token in place. q and k are viewed as (batch, heads, seq, head_dim). Tile 0
+ * along the head_dim axis spans [0:BLOCK_HD) and tile 1 spans
+ * [BLOCK_HD:2*BLOCK_HD), so the rotation pairs are (q[i], q[i + BLOCK_HD]). */
+template <typename T, int BATCH_, int Q_HEADS_, int K_HEADS_,
+          int BLOCK_QH_, int BLOCK_KH_, int BLOCK_HD_,
+          int HALF_ROPE_DIM_, int HEAD_DIM_, int COS_BS_, int SEQ_LEN_>
+__tile_global__ void rope(T* __restrict__ q,   /* in/out: (BATCH_, Q_HEADS_, SEQ_LEN_, HEAD_DIM_) */
+                          T* __restrict__ k,   /* in/out: (BATCH_, K_HEADS_, SEQ_LEN_, HEAD_DIM_) */
+                          T* __restrict__ cos, /* in: (COS_BS_, SEQ_LEN_, HALF_ROPE_DIM_) */
+                          T* __restrict__ sin) { /* in: same extents as cos */
+  namespace ct = cuda::tiles;
+
+  q   = ct::assume_aligned<16>(q); /* alignment hint; presumably needs 16-byte-aligned buffers - confirm */
+  k   = ct::assume_aligned<16>(k);
+  cos = ct::assume_aligned<16>(cos);
+  sin = ct::assume_aligned<16>(sin);
+
+  int pid           = ct::bid().x;      /* flat token index of this block */
+  int batch_idx     = pid / SEQ_LEN_;   /* which batch entry */
+  int row_idx       = pid % SEQ_LEN_;   /* which position in the sequence */
+  int cos_batch_idx = (COS_BS_ == 1) ? 0 : batch_idx; /* COS_BS_ == 1: one table for every batch */
+
+  auto pCos = ct::partition_view(ct::tensor_span{cos, ct::extents{COS_BS_, SEQ_LEN_, HALF_ROPE_DIM_}},
+                                 ct::shape<1, 1, BLOCK_HD_>{}); /* one tile = BLOCK_HD_ entries of one table row */
+  auto cos_loaded = pCos.load(cos_batch_idx, row_idx, 0);
+  auto cos_row    = ct::reshape<ct::shape<1, BLOCK_HD_>>(cos_loaded); /* drop one leading unit dim */
+
+  auto pSin = ct::partition_view(ct::tensor_span{sin, ct::extents{COS_BS_, SEQ_LEN_, HALF_ROPE_DIM_}},
+                                 ct::shape<1, 1, BLOCK_HD_>{});
+  auto sin_loaded = pSin.load(cos_batch_idx, row_idx, 0);
+  auto sin_row    = ct::reshape<ct::shape<1, BLOCK_HD_>>(sin_loaded);
+
+  /* Process Q. Tile indices 0 and 1 along the head_dim axis cover
+   * [0:2*BLOCK_HD) == [0:rope_dim); elements past rope_dim are not touched. */
+  auto pQ = ct::partition_view(ct::tensor_span{q, ct::extents{BATCH_, Q_HEADS_, SEQ_LEN_, HEAD_DIM_}},
+                               ct::shape<1, BLOCK_QH_, 1, BLOCK_HD_>{});
+  auto q_tile_1_loaded = pQ.load(batch_idx, 0, row_idx, 0); /* head tile 0, first half */
+  auto q_tile_1        = ct::reshape<ct::shape<BLOCK_QH_, BLOCK_HD_>>(q_tile_1_loaded);
+
+  auto q_tile_2_loaded = pQ.load(batch_idx, 0, row_idx, 1); /* head tile 0, second half */
+  auto q_tile_2        = ct::reshape<ct::shape<BLOCK_QH_, BLOCK_HD_>>(q_tile_2_loaded);
+
+  auto cos_bcast_q = ct::broadcast<ct::shape<BLOCK_QH_, BLOCK_HD_>>(cos_row); /* same angles for every head */
+  auto sin_bcast_q = ct::broadcast<ct::shape<BLOCK_QH_, BLOCK_HD_>>(sin_row);
+
+  /* y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin] */
+  auto new_q_tile_1 = q_tile_1 * cos_bcast_q - q_tile_2 * sin_bcast_q;
+  auto new_q_tile_2 = q_tile_2 * cos_bcast_q + q_tile_1 * sin_bcast_q;
+
+  auto new_q_tile_1_reshaped = ct::reshape<ct::shape<1, BLOCK_QH_, 1, BLOCK_HD_>>(new_q_tile_1); /* back to the 4D tile shape */
+  auto new_q_tile_2_reshaped = ct::reshape<ct::shape<1, BLOCK_QH_, 1, BLOCK_HD_>>(new_q_tile_2);
+
+  pQ.store(new_q_tile_1_reshaped, batch_idx, 0, row_idx, 0); /* both halves were loaded above, so in-place is safe */
+  pQ.store(new_q_tile_2_reshaped, batch_idx, 0, row_idx, 1);
+
+  /* Process K (in place), reusing the cos/sin rows loaded for this token. */
+  auto pK = ct::partition_view(ct::tensor_span{k, ct::extents{BATCH_, K_HEADS_, SEQ_LEN_, HEAD_DIM_}},
+                               ct::shape<1, BLOCK_KH_, 1, BLOCK_HD_>{});
+  auto k_tile_1_loaded = pK.load(batch_idx, 0, row_idx, 0); /* head tile 0, first half */
+  auto k_tile_1        = ct::reshape<ct::shape<BLOCK_KH_, BLOCK_HD_>>(k_tile_1_loaded);
+
+  auto k_tile_2_loaded = pK.load(batch_idx, 0, row_idx, 1); /* head tile 0, second half */
+  auto k_tile_2        = ct::reshape<ct::shape<BLOCK_KH_, BLOCK_HD_>>(k_tile_2_loaded);
+
+  auto cos_bcast_k = ct::broadcast<ct::shape<BLOCK_KH_, BLOCK_HD_>>(cos_row);
+  auto sin_bcast_k = ct::broadcast<ct::shape<BLOCK_KH_, BLOCK_HD_>>(sin_row);
+
+  auto new_k_tile_1 = k_tile_1 * cos_bcast_k - k_tile_2 * sin_bcast_k; /* same rotation as for Q */
+  auto new_k_tile_2 = k_tile_2 * cos_bcast_k + k_tile_1 * sin_bcast_k;
+
+  auto new_k_tile_1_reshaped = ct::reshape<ct::shape<1, BLOCK_KH_, 1, BLOCK_HD_>>(new_k_tile_1);
+  auto new_k_tile_2_reshaped = ct::reshape<ct::shape<1, BLOCK_KH_, 1, BLOCK_HD_>>(new_k_tile_2);
+
+  pK.store(new_k_tile_1_reshaped, batch_idx, 0, row_idx, 0);
+  pK.store(new_k_tile_2_reshaped, batch_idx, 0, row_idx, 1);
+}
+
+/* Host reference check. Recomputes the split-half rotation of the saved
+ * inputs in double precision, rounds it to __half, and compares it with the
+ * kernel output; the first element pair off by more than 0.1 is reported. */
+static bool verify(const __half* h_in,
+                   const __half* h_out,
+                   const __half* h_cos,
+                   const __half* h_sin,
+                   int heads,
+                   const char* name) {
+  constexpr float kTolerance = 1e-1f;
+  for (int b = 0; b < BATCH; ++b) {
+    const int cb = (COS_BS == 1) ? 0 : b; /* batch slot in the cos/sin tables */
+    for (int h = 0; h < heads; ++h) {
+      for (int s = 0; s < SEQ_LEN; ++s) {
+        const std::size_t row   = (((std::size_t)b * heads + h) * SEQ_LEN + s) * HEAD_DIM;
+        const std::size_t table = ((std::size_t)cb * SEQ_LEN + s) * HALF_ROPE_DIM;
+
+        for (int i = 0; i < HALF_ROPE_DIM; ++i) {
+          const std::size_t lo = row + i;
+          const std::size_t hi = lo + HALF_ROPE_DIM;
+
+          const double x_lo = (double)(float)h_in[lo];
+          const double x_hi = (double)(float)h_in[hi];
+          const double cs   = (double)(float)h_cos[table + i];
+          const double sn   = (double)(float)h_sin[table + i];
+
+          __half want_lo = __half{(float)(x_lo * cs - x_hi * sn)};
+          __half want_hi = __half{(float)(x_hi * cs + x_lo * sn)};
+
+          const float err_lo = std::fabs((float)h_out[lo] - (float)want_lo);
+          const float err_hi = std::fabs((float)h_out[hi] - (float)want_hi);
+          if (!(err_lo > kTolerance || err_hi > kTolerance)) continue;
+
+          printf("Mismatch in %s at (b=%d, h=%d, s=%d, i=%d):\n", name, b, h, s, i);
+          printf("  Expected: %s[%d,%d,%d,%d]=%f, %s[%d,%d,%d,%d]=%f\n",
+                 name, b, h, s, i, (float)want_lo, name, b, h, s, i + HALF_ROPE_DIM, (float)want_hi);
+          printf("  Actual:   %s[%d,%d,%d,%d]=%f, %s[%d,%d,%d,%d]=%f\n",
+                 name, b, h, s, i, (float)h_out[lo], name, b, h, s, i + HALF_ROPE_DIM, (float)h_out[hi]);
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+int main() {
+  /* Allocates and initializes q/k/cos/sin on the device, runs RoPE in place,
+   * checks the result on the host. Returns 0 on success, 1 on a mismatch. */
+  __half* d_q   = nullptr;
+  __half* d_k   = nullptr;
+  __half* d_cos = nullptr;
+  __half* d_sin = nullptr;
+  checkCudaErrors(cudaMalloc(&d_q,   Q_SIZE   * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_k,   K_SIZE   * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_cos, COS_SIZE * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_sin, COS_SIZE * sizeof(__half)));
+
+  int threads_per_block = 256;
+  int num_blocks        = (int)((INIT_N + threads_per_block - 1) / threads_per_block);
+
+  initializeInputs<<<num_blocks, threads_per_block>>>(d_q, d_k, d_cos, d_sin);
+  checkCudaErrors(cudaGetLastError());
+
+  /* Snapshot the inputs before the in-place kernel mutates them. */
+  __half* h_q_in = new __half[Q_SIZE];
+  __half* h_k_in = new __half[K_SIZE];
+  __half* h_cos  = new __half[COS_SIZE];
+  __half* h_sin  = new __half[COS_SIZE];
+  checkCudaErrors(cudaMemcpy(h_q_in, d_q,   Q_SIZE   * sizeof(__half), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_k_in, d_k,   K_SIZE   * sizeof(__half), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_cos,  d_cos, COS_SIZE * sizeof(__half), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_sin,  d_sin, COS_SIZE * sizeof(__half), cudaMemcpyDeviceToHost));
+
+  rope<__half, BATCH, Q_HEADS, K_HEADS, BLOCK_QH, BLOCK_KH, BLOCK_HD,
+       HALF_ROPE_DIM, HEAD_DIM, COS_BS, SEQ_LEN>
+      <<<BATCH * SEQ_LEN>>>(d_q, d_k, d_cos, d_sin);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  __half* h_q_out = new __half[Q_SIZE];
+  __half* h_k_out = new __half[K_SIZE];
+  checkCudaErrors(cudaMemcpy(h_q_out, d_q, Q_SIZE * sizeof(__half), cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(h_k_out, d_k, K_SIZE * sizeof(__half), cudaMemcpyDeviceToHost));
+
+  /* K is only checked when Q passes; a mismatch still reaches the cleanup below. */
+  bool ok = verify(h_q_in, h_q_out, h_cos, h_sin, Q_HEADS, "Q") &&
+            verify(h_k_in, h_k_out, h_cos, h_sin, K_HEADS, "K");
+  if (ok) printf("Success! RoPE matches expected results.\n");
+
+  checkCudaErrors(cudaFree(d_q));
+  checkCudaErrors(cudaFree(d_k));
+  checkCudaErrors(cudaFree(d_cos));
+  checkCudaErrors(cudaFree(d_sin));
+  delete[] h_q_in;
+  delete[] h_k_in;
+  delete[] h_cos;
+  delete[] h_sin;
+  delete[] h_q_out;
+  delete[] h_k_out;
+  return ok ? 0 : 1;
+}
diff --git a/cpp/9_CUDA_Tile/tileSpMV/CMakeLists.txt b/cpp/9_CUDA_Tile/tileSpMV/CMakeLists.txt
new file mode 100644
index 00000000..8228c74a
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileSpMV/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules") # repository-level CMake modules
+
+project(tileSpMV LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120) # GPU architectures to generate code for
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile") # presumably turns on CUDA Tile C++ support in nvcc - confirm against the nvcc docs
+
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # device-side debug build
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Shared sample headers (the source includes helper_cuda.h)
+include_directories(../../../Common)
+
+# Single-source executable
+add_executable(tileSpMV tileSpMV.cu)
+
+target_compile_features(tileSpMV PRIVATE cxx_std_20 cuda_std_20) # C++20 for both host and CUDA code
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileSpMV/README.md b/cpp/9_CUDA_Tile/tileSpMV/README.md
new file mode 100644
index 00000000..63de8a4f
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileSpMV/README.md
@@ -0,0 +1,44 @@
+# tileSpMV
+
+## Description
+
+This sample demonstrates sparse matrix-vector multiplication (SpMV)
+`y = A * x` using CUDA Tile C++.
+
+The matrix is built directly on the host in Sliced ELLPACK (SELL)
+format — the format the Tile kernel actually reads. SELL is the
+same idea as ELLPACK applied per-slice: rows are first sorted by
+length (to minimize padding within a slice), then grouped into
+slices of `ROWS` rows and stored column-major so that *the k-th
+nonzero of every row in the slice* occupies a contiguous span of
+`ROWS` elements in memory.
+
+Each CTA processes one slice using a 2D tile of `shape<ROWS, COLS>`:
+
+- Dimension 0 (`ROWS`): the rows of the slice (one tile row per
+  matrix row in the slice)
+- Dimension 1 (`COLS`): the next `COLS` nonzeros of every row in the
+  slice, processed simultaneously
+
+The kernel computes partial products against the x-vector (an
+irreducible gather), accumulates into a 2D tile, reduces along the
+column dimension with `cuda::tiles::sum(acc, 1_ic)` to produce one
+sum per row, and scatters the per-row sums to `y` using the slice
+permutation array.
+
+The sample generates a single random sparse matrix and verifies the
+Tile kernel's output against a CPU reference SpMV.
+
+## Expected Output
+
+```
+Random sparse matrix: rows=100000, cols=100000, nnz=..., avg nnz/row=...
+Tile configuration: ROWS=64, COLS=16 (... slices)
+Success! Tile SpMV matches the CPU reference.
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileSpMV/tileSpMV.cu b/cpp/9_CUDA_Tile/tileSpMV/tileSpMV.cu
new file mode 100644
index 00000000..453e7f75
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileSpMV/tileSpMV.cu
@@ -0,0 +1,494 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates sparse matrix-vector multiplication
+ * (SpMV) using CUDA Tile C++.
+ *
+ *   y = A * x
+ *
+ * The matrix is built directly on the host in Sliced ELLPACK (SELL)
+ * format — the format the Tile kernel actually reads. SELL is the
+ * same idea as ELLPACK applied per-slice: rows are sorted by
+ * length (to minimize padding), grouped into slices of SLICE_ROWS
+ * rows, and stored column-major so that "the k-th nonzero of
+ * every row in the slice" occupies contiguous memory.
+ *
+ *   slice s contains rows row_perm[s*SLICE_ROWS .. s*SLICE_ROWS + SLICE_ROWS)
+ *   sell_data[slice_offsets[s] + k * SLICE_ROWS + r] is the k-th
+ *   nonzero of the r-th row of slice s, with padding (column = 0,
+ *   value = 0.0f) when the row has fewer than slice_widths[s]
+ *   nonzeros
+ *
+ * Each CTA processes one slice using a 2D tile of shape<ROWS, COLS>:
+ *
+ *   - Dimension 0 (ROWS): the rows of the slice (one tile row per
+ *     matrix row in the slice)
+ *   - Dimension 1 (COLS): the next COLS nonzeros of every row in
+ *     the slice, processed simultaneously
+ *
+ * Because of the column-major slice layout, "load the next COLS
+ * nonzeros for every row" is a contiguous, fully coalesced load.
+ * This is the property that distinguishes SELL from a row-by-row
+ * CSR kernel: CSR forces the kernel to gather column indices and
+ * values from disjoint per-row address streams. The kernel computes
+ * partial products against the x-vector (an irreducible gather),
+ * accumulates into a 2D tile, then reduces along the column
+ * dimension with 'cuda::tiles::sum(acc, 1_ic)' to produce one sum
+ * per row, and scatters the sums to y using row_perm.
+ */
+
+#include "helper_cuda.h"
+
+#include "cuda_tile.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <numeric>
+#include <random>
+#include <vector>
+
+namespace ct = cuda::tiles;
+
+//=============================================================================
+// Tile SpMV kernel: 2D SELL SpMV
+//
+// Each CTA processes one slice of ROWS rows. The inner loop walks
+// the slice's nonzeros COLS at a time. Because the SELL arrays are
+// laid out column-major within a slice, the per-iteration load of
+// 'sell_col_indices' and 'sell_values' covers one contiguous span
+// of ROWS*COLS elements — i.e. perfectly coalesced — even though
+// the underlying rows have wildly different lengths.
+//=============================================================================
+
+template <int ROWS, int COLS, int OCCUPANCY = 8>
+[[cutile::hint(0, occupancy = OCCUPANCY)]]
+__tile_global__ void spmvSell(int num_rows, /* real row count; lanes at or past it are masked off */
+                              const int* __restrict__ sell_col_indices, /* SELL column indices */
+                              const float* __restrict__ sell_values,    /* SELL values, same indexing */
+                              const int* __restrict__ slice_offsets,    /* start of each slice in the SELL arrays */
+                              const int* __restrict__ slice_widths,     /* entries stored per row in each slice */
+                              const int* __restrict__ row_perm,         /* slice lane -> destination row of y */
+                              const float* __restrict__ vector_x,       /* input vector, gathered by column index */
+                              float* __restrict__ vector_y) {           /* output vector, written by scatter */
+  using namespace ct::literals;
+
+  using Tile2D = ct::tile<float, ct::shape<ROWS, COLS>>;
+  using RowI = ct::tile<int, ct::shape<ROWS>>;
+  using ColI = ct::tile<int, ct::shape<COLS>>;
+
+  sell_col_indices = ct::assume_aligned<16>(sell_col_indices); /* alignment hint; presumably needs 16-byte-aligned buffers - confirm */
+  sell_values = ct::assume_aligned<16>(sell_values);
+  vector_x = ct::assume_aligned<16>(vector_x);
+
+  int slice = ct::bid().x;      /* slice handled by this block */
+  int row_base = slice * ROWS;  /* first position of this slice in row_perm */
+
+  auto local_row = ct::iota<RowI>();
+  auto row_valid = (row_base + local_row) < num_rows; /* false for lanes past the last real row */
+  /* destination row in y for each lane of the slice */
+  auto actual_row = ct::load_masked(row_perm + row_base + local_row,
+                                    row_valid, 0);
+
+  int offset = slice_offsets[slice];
+  int width = slice_widths[slice];
+
+  /* Build 2D index tiles. row_2d broadcasts the in-slice row index
+   * along the column dimension; col_base_2d broadcasts the COLS
+   * lanes along the row dimension. The SELL element address is
+   *   offset + (k + col) * ROWS + r
+   * which becomes the body of the inner loop below. */
+  auto row_2d = ct::broadcast(
+      ct::reshape(local_row, ct::shape<ROWS, 1>{}), ct::shape<ROWS, COLS>{});
+  auto col_base = ct::iota<ColI>();
+  auto row_valid_2d = ct::broadcast(
+      ct::reshape(row_valid, ct::shape<ROWS, 1>{}), ct::shape<ROWS, COLS>{});
+  auto col_base_2d = ct::broadcast(
+      ct::reshape(col_base, ct::shape<1, COLS>{}), ct::shape<ROWS, COLS>{});
+
+  Tile2D acc = ct::zeros<Tile2D>();
+
+  /* Loop-split: the iterations with k < 'full_width' need no
+   * per-element column mask (every lane is within 'width'); only the
+   * optional trailing iteration uses one. Eliminating the mask from
+   * the hot loop saves a predicate evaluation per element per iteration. */
+  int full_width = (width / COLS) * COLS;
+
+  #pragma unroll 1
+  for (int k = 0; k < full_width; k += COLS) {
+    auto sell_idx = offset + (k + col_base_2d) * ROWS + row_2d;
+    auto cols_2d = ct::load_masked(sell_col_indices + sell_idx,
+                                   row_valid_2d, 0);
+    auto vals_2d = ct::load_masked(sell_values + sell_idx,
+                                   row_valid_2d, 0.0f);
+    auto x_2d = ct::load_masked(vector_x + cols_2d, row_valid_2d, 0.0f); /* gather x[col] */
+    acc = acc + vals_2d * x_2d; /* padding slots hold value 0.0f, so they add nothing */
+  }
+
+  if (full_width < width) {
+    auto col_offsets_2d = full_width + col_base_2d;
+    auto valid = row_valid_2d & (col_offsets_2d < width); /* also mask columns past the slice width */
+    auto sell_idx = offset + col_offsets_2d * ROWS + row_2d;
+    auto cols_2d = ct::load_masked(sell_col_indices + sell_idx, valid, 0);
+    auto vals_2d = ct::load_masked(sell_values + sell_idx, valid, 0.0f);
+    auto x_2d = ct::load_masked(vector_x + cols_2d, valid, 0.0f);
+    acc = acc + vals_2d * x_2d;
+  }
+
+  /* Reduce along the column dimension to get one sum per slice row,
+   * then scatter to the destination row in y. */
+  auto row_sums = ct::sum(acc, 1_ic);
+  ct::store_masked(vector_y + actual_row,
+                   ct::reshape(row_sums, ct::shape<ROWS>{}), row_valid);
+}
+
+//=============================================================================
+// Tile shape configuration
+//
+// The kernel is templated on (ROWS, COLS); ROWS is also the slice
+// size in the SELL packing. We use a single shape sized for the
+// random matrix generated below (~16 nonzeros per row on average).
+//=============================================================================
+
+constexpr int SLICE_ROWS = 64;  /* rows per SELL slice; used by the host packing and the CPU reference */
+constexpr int TILE_COLS = 16;   /* presumably the kernel's COLS template argument - confirm at the launch site */
+
+//=============================================================================
+// Sliced ELLPACK (SELL) matrix: host arrays plus their device copies
+//
+// Layout:
+//   slice s covers row_perm[s*SLICE_ROWS .. s*SLICE_ROWS + SLICE_ROWS)
+//   slice_widths[s]  = max( nnz_per_row[row] for row in slice s )
+//   slice_offsets[s] = sum_{t<s} slice_widths[t] * SLICE_ROWS
+//   sell_col_indices[slice_offsets[s] + k * SLICE_ROWS + r] =
+//       column index of the k-th nonzero of slice s, row r
+//       (or 0 if that slot is padding)
+//   sell_values[...] same indexing, holding the value (0.0f for padding)
+//=============================================================================
+
+struct SellMatrix {
+  int num_rows = 0;                   /* real (unpadded) row count */
+  int num_cols = 0;
+  int num_slices = 0;                 /* ceil(num_rows / SLICE_ROWS) */
+  int nnz_total = 0;                  /* non-padding entries only */
+  std::size_t total_sell_entries = 0; /* including padding */
+
+  std::vector<int> row_perm;       /* size num_slices * SLICE_ROWS; sorted slot -> original row */
+  std::vector<int> slice_offsets;  /* size num_slices */
+  std::vector<int> slice_widths;   /* size num_slices */
+  std::vector<int> row_lengths;    /* size num_slices * SLICE_ROWS, padded; host only, not uploaded */
+  std::vector<int> sell_col_indices; /* size total_sell_entries */
+  std::vector<float> sell_values;    /* size total_sell_entries */
+
+  int* d_row_perm = nullptr;         /* device copies; null until uploadToDevice() */
+  int* d_slice_offsets = nullptr;
+  int* d_slice_widths = nullptr;
+  int* d_sell_col_indices = nullptr;
+  float* d_sell_values = nullptr;
+
+  void uploadToDevice() { /* allocate a device buffer per host array, then copy each one over */
+    checkCudaErrors(cudaMalloc(&d_row_perm, row_perm.size() * sizeof(int)));
+    checkCudaErrors(cudaMalloc(&d_slice_offsets,
+                               slice_offsets.size() * sizeof(int)));
+    checkCudaErrors(cudaMalloc(&d_slice_widths,
+                               slice_widths.size() * sizeof(int)));
+    checkCudaErrors(cudaMalloc(&d_sell_col_indices,
+                               sell_col_indices.size() * sizeof(int)));
+    checkCudaErrors(cudaMalloc(&d_sell_values,
+                               sell_values.size() * sizeof(float)));
+    checkCudaErrors(cudaMemcpy(d_row_perm, row_perm.data(),
+                               row_perm.size() * sizeof(int),
+                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_slice_offsets, slice_offsets.data(),
+                               slice_offsets.size() * sizeof(int),
+                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_slice_widths, slice_widths.data(),
+                               slice_widths.size() * sizeof(int),
+                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_sell_col_indices, sell_col_indices.data(),
+                               sell_col_indices.size() * sizeof(int),
+                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_sell_values, sell_values.data(),
+                               sell_values.size() * sizeof(float),
+                               cudaMemcpyHostToDevice));
+  }
+
+  void freeDevice() { /* free whatever is allocated; pointers are reset so a repeat call is a no-op */
+    if (d_row_perm) checkCudaErrors(cudaFree(d_row_perm));
+    if (d_slice_offsets) checkCudaErrors(cudaFree(d_slice_offsets));
+    if (d_slice_widths) checkCudaErrors(cudaFree(d_slice_widths));
+    if (d_sell_col_indices) checkCudaErrors(cudaFree(d_sell_col_indices));
+    if (d_sell_values) checkCudaErrors(cudaFree(d_sell_values));
+    d_row_perm = nullptr;
+    d_slice_offsets = nullptr;
+    d_slice_widths = nullptr;
+    d_sell_col_indices = nullptr;
+    d_sell_values = nullptr;
+  }
+};
+
+/* Packs a row-wise sparse matrix into SELL (host vectors only; no
+ * device memory is touched here).
+ *   row_lengths[r]  number of stored entries of row r
+ *   row_cols_flat   column indices of all rows, concatenated in row order
+ *   row_vals_flat   values, parallel to row_cols_flat
+ * Rows are ordered by ascending length before being cut into slices,
+ * which keeps the rows of a slice similar in length and so limits
+ * the zero-padding needed inside each slice. */
+static SellMatrix packSell(int num_rows, int num_cols,
+                           const std::vector<int>& row_lengths,
+                           const std::vector<int>& row_cols_flat,
+                           const std::vector<float>& row_vals_flat) {
+  SellMatrix S;
+  S.num_rows = num_rows;
+  S.num_cols = num_cols;
+  S.num_slices = (num_rows + SLICE_ROWS - 1) / SLICE_ROWS;
+
+  /* row_start[r] = index of row r's first entry in the flat arrays */
+  std::vector<int> row_start(num_rows + 1, 0);
+  std::partial_sum(row_lengths.begin(), row_lengths.begin() + num_rows,
+                   row_start.begin() + 1);
+  S.nnz_total = row_start[num_rows];
+
+  /* row ids ordered from the shortest row to the longest */
+  const auto shorter = [&](int a, int b) {
+    return row_lengths[a] < row_lengths[b];
+  };
+  std::vector<int> order(num_rows);
+  std::iota(order.begin(), order.end(), 0);
+  std::sort(order.begin(), order.end(), shorter);
+
+  /* Round the row count up to whole slices. The extra slots keep
+   * row id 0 and length 0; the kernel masks those lanes out with
+   * row_valid, so they contribute nothing. */
+  const std::size_t padded_rows =
+      static_cast<std::size_t>(S.num_slices) * SLICE_ROWS;
+  S.row_perm.assign(padded_rows, 0);
+  S.row_lengths.assign(padded_rows, 0);
+  std::copy(order.begin(), order.end(), S.row_perm.begin());
+  std::transform(order.begin(), order.end(), S.row_lengths.begin(),
+                 [&](int row) { return row_lengths[row]; });
+
+  /* Per slice: width = longest row in the slice, offset = running
+   * total of width * SLICE_ROWS over the preceding slices. */
+  S.slice_widths.assign(S.num_slices, 0);
+  S.slice_offsets.assign(S.num_slices, 0);
+  std::size_t running = 0;
+  for (int s = 0; s < S.num_slices; ++s) {
+    const auto first = S.row_lengths.begin()
+                       + static_cast<std::ptrdiff_t>(s) * SLICE_ROWS;
+    const int widest = std::max(0, *std::max_element(first, first + SLICE_ROWS));
+    S.slice_widths[s] = widest;
+    S.slice_offsets[s] = static_cast<int>(running);
+    running += static_cast<std::size_t>(widest) * SLICE_ROWS;
+  }
+  S.total_sell_entries = running;
+
+  /* Scatter every real row into the column-major slice layout. Slots
+   * for k in [row length, slice width) keep the zero padding that
+   * assign() wrote. */
+  S.sell_col_indices.assign(S.total_sell_entries, 0);
+  S.sell_values.assign(S.total_sell_entries, 0.0f);
+
+  for (int pos = 0; pos < num_rows; ++pos) {
+    const int slice = pos / SLICE_ROWS;
+    const int lane = pos % SLICE_ROWS;
+    const int row = S.row_perm[pos];
+    const int row_len = row_lengths[row];
+    const int src_begin = row_start[row];
+    const std::size_t dst_begin =
+        static_cast<std::size_t>(S.slice_offsets[slice]) + lane;
+
+    for (int k = 0; k < row_len; ++k) {
+      const std::size_t dst =
+          dst_begin + static_cast<std::size_t>(k) * SLICE_ROWS;
+      S.sell_col_indices[dst] = row_cols_flat[src_begin + k];
+      S.sell_values[dst] = row_vals_flat[src_begin + k];
+    }
+  }
+
+  return S;
+}
+
+//=============================================================================
+// Random matrix generator (produces SellMatrix directly)
+//
+// Each row draws a Poisson-distributed entry count (clamped to
+// [1, num_cols]) and that many uniform random column indices, which
+// are sorted and de-duplicated. packSell() handles the slice layout.
+//=============================================================================
+
+static SellMatrix generateRandom(int num_rows, int num_cols,
+                                 int avg_nnz_per_row, unsigned seed) {
+  std::mt19937 engine(seed);
+  std::poisson_distribution<int> draw_len(static_cast<double>(avg_nnz_per_row));
+  std::uniform_int_distribution<int> draw_col(0, num_cols - 1);
+  std::uniform_real_distribution<float> draw_val(-1.0f, 1.0f);
+
+  std::vector<int> lengths;
+  std::vector<int> all_cols;
+  std::vector<float> all_vals;
+  lengths.reserve(num_rows);
+  std::vector<int> picked;
+  for (int row = 0; row < num_rows; ++row) {
+    const int wanted = std::min(num_cols, std::max(1, draw_len(engine)));
+    picked.clear();
+    picked.reserve(wanted);
+    for (int remaining = wanted; remaining > 0; --remaining) {
+      picked.push_back(draw_col(engine));
+    }
+    std::sort(picked.begin(), picked.end());
+    picked.erase(std::unique(picked.begin(), picked.end()), picked.end());
+
+    /* one random value per surviving column, in ascending column order */
+    all_cols.insert(all_cols.end(), picked.begin(), picked.end());
+    for (std::size_t k = 0; k < picked.size(); ++k) {
+      all_vals.push_back(draw_val(engine));
+    }
+    lengths.push_back(static_cast<int>(picked.size()));
+  }
+
+  return packSell(num_rows, num_cols, lengths, all_cols, all_vals);
+}
+
+//=============================================================================
+// CPU reference SpMV (y = S * x) — reads SELL directly so the sample has no
+// dependency on CSR or any external sparse-matrix library. y is overwritten.
+//=============================================================================
+
+static void cpuSpMV(const SellMatrix& S, const std::vector<float>& x,
+                    std::vector<float>& y) {
+  y.assign(S.num_rows, 0.0f);
+  for (int s = 0; s < S.num_slices; ++s) {
+    int offset = S.slice_offsets[s];
+    for (int r = 0; r < SLICE_ROWS; ++r) {
+      int global_idx = s * SLICE_ROWS + r;
+      if (global_idx >= S.num_rows) continue; /* slot beyond the last row */
+      int dst_row = S.row_perm[global_idx]; /* output row this slot maps to */
+      int row_len = S.row_lengths[global_idx];
+      float sum = 0.0f;
+      /* entry k of slice row r is stored at offset + k * SLICE_ROWS + r; only
+       * the first row_len entries are read, so slice padding is never touched */
+      for (int k = 0; k < row_len; ++k) {
+        std::size_t src = static_cast<std::size_t>(offset)
+                          + static_cast<std::size_t>(k) * SLICE_ROWS + r;
+        sum += S.sell_values[src] * x[S.sell_col_indices[src]];
+      }
+      y[dst_row] = sum;
+    }
+  }
+}
+
+/* Compare device result to the CPU reference. SpMV runs in single
+ * precision and the kernel reduces in a different order than the CPU,
+ * so element i may differ by max(abs_tol, rel_tol * |reference[i]|).
+ * A NaN difference always fails. Prints the offending element and
+ * returns false on failure; result must be as long as reference. */
+static bool verify(const std::vector<float>& reference,
+                   const std::vector<float>& result,
+                   float rel_tol = 1e-2f, float abs_tol = 1e-4f) {
+  float max_err = 0.0f;
+  int bad_idx = -1;
+  for (std::size_t i = 0; i < reference.size(); ++i) {
+    float diff = std::fabs(reference[i] - result[i]);
+    float over = diff - std::max(abs_tol, rel_tol * std::fabs(reference[i]));
+    /* NaN fails every ordered comparison, so it has to be tested explicitly */
+    const bool is_nan = std::isnan(over);
+    if (is_nan || over > max_err) {
+      max_err = over;
+      bad_idx = static_cast<int>(i);
+    }
+    if (is_nan) break;
+  }
+  if (bad_idx < 0) return true;
+  printf("Verification FAILED at index %d (ref=%g, got=%g, diff=%g)\n",
+         bad_idx, reference[bad_idx], result[bad_idx],
+         std::fabs(reference[bad_idx] - result[bad_idx]));
+  return false;
+}
+
+//=============================================================================
+// Main
+//=============================================================================
+
+int main() {
+  /* Random sparse matrix: ~16 nonzeros per row on average, sized to
+   * match the chosen tile shape (SLICE_ROWS = 64, TILE_COLS = 16). */
+  SellMatrix S = generateRandom(/*num_rows=*/100000, /*num_cols=*/100000,
+                                /*avg_nnz_per_row=*/16, /*seed=*/0xA5A5);
+
+  printf("Random sparse matrix: rows=%d, cols=%d, nnz=%d, "
+         "avg nnz/row=%.1f\n",
+         S.num_rows, S.num_cols, S.nnz_total,
+         static_cast<double>(S.nnz_total) / S.num_rows);
+  printf("Tile configuration: ROWS=%d, COLS=%d (%d slices)\n",
+         SLICE_ROWS, TILE_COLS, S.num_slices);
+
+  /* host input vector x: one value per column, uniform in [-1, 1), fixed seed */
+  std::vector<float> h_x(S.num_cols);
+  std::mt19937 rng(0xC0FFEE);
+  std::uniform_real_distribution<float> x_dist(-1.0f, 1.0f);
+  for (float& v : h_x) v = x_dist(rng);
+
+  /* CPU reference, computed on the host copy of S before any device work */
+  std::vector<float> ref_y;
+  cpuSpMV(S, h_x, ref_y);
+
+  /* device side: uploadToDevice() is expected to set the S.d_* pointers */
+  S.uploadToDevice();
+  float* d_x = nullptr;
+  float* d_y = nullptr;
+  checkCudaErrors(cudaMalloc(&d_x, S.num_cols * sizeof(float)));
+  checkCudaErrors(cudaMalloc(&d_y, S.num_rows * sizeof(float)));
+  checkCudaErrors(cudaMemcpy(d_x, h_x.data(), S.num_cols * sizeof(float),
+                             cudaMemcpyHostToDevice));
+
+  /* Launch the SELL Tile kernel: one CTA per slice (grid = S.num_slices). */
+  spmvSell<SLICE_ROWS, TILE_COLS><<<S.num_slices>>>(
+      S.num_rows, S.d_sell_col_indices, S.d_sell_values,
+      S.d_slice_offsets, S.d_slice_widths, S.d_row_perm, d_x, d_y);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  /* copy result back, release device memory, then verify */
+  std::vector<float> h_y(S.num_rows);
+  checkCudaErrors(cudaMemcpy(h_y.data(), d_y, S.num_rows * sizeof(float),
+                             cudaMemcpyDeviceToHost));
+
+  S.freeDevice();
+  checkCudaErrors(cudaFree(d_x));
+  checkCudaErrors(cudaFree(d_y));
+
+  if (!verify(ref_y, h_y)) {
+    return 1; /* verify() already printed the mismatch */
+  }
+
+  printf("Success! Tile SpMV matches the CPU reference.\n");
+  return 0;
+}
diff --git a/cpp/9_CUDA_Tile/tileTranspose/CMakeLists.txt b/cpp/9_CUDA_Tile/tileTranspose/CMakeLists.txt
new file mode 100644
index 00000000..401c8634
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileTranspose/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileTranspose LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Include directories and libraries
+include_directories(../../../Common)
+
+# Source file
+add_executable(tileTranspose tileTranspose.cu)
+
+target_compile_features(tileTranspose PRIVATE cxx_std_20 cuda_std_20)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileTranspose/README.md b/cpp/9_CUDA_Tile/tileTranspose/README.md
new file mode 100644
index 00000000..0b2f3613
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileTranspose/README.md
@@ -0,0 +1,21 @@
+# tileTranspose
+
+## Description
+
+This sample demonstrates how to transpose a 2D matrix using CUDA Tile
+C++. Each block handles one 128 x 256 chunk of the n x m source matrix.
+The block loads its chunk, transposes it locally, and stores it to the
+mirrored position in the result matrix. A cuda::tiles::partition_view
+is used to model the chunking of the source and result matrices.
+
+## Expected Output
+
+```
+Success! Matrix transpose matches expected results.
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileTranspose/tileTranspose.cu b/cpp/9_CUDA_Tile/tileTranspose/tileTranspose.cu
new file mode 100644
index 00000000..221ea152
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileTranspose/tileTranspose.cu
@@ -0,0 +1,126 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * This sample demonstrates how to transpose a 2D matrix using CUDA
+ * Tile C++. Each block handles one CHUNK_N x CHUNK_M chunk of the
+ * n x m source matrix. The block loads its chunk, transposes it
+ * locally, and stores it to the mirrored position in the result
+ * matrix. A cuda::tiles::partition_view is used to model the
+ * chunking of the source and result matrices.
+ */
+
+#include "helper_cuda.h"
+#include "cuda_tile.h"
+#include <cstdio>
+
+constexpr int CHUNK_N = 128;
+constexpr int CHUNK_M = 256;
+
+/* Tile kernel: writes the transpose of the n x m matrix 'a' into the m x n matrix 'b' ('__restrict__' is important for performance) */
+__tile_global__ void transpose(float* __restrict__ a,
+                               float* __restrict__ b,
+                               std::size_t n,
+                               std::size_t m) {
+  /* namespace alias; ct::literals is presumably what provides the _ic suffix used below */
+  namespace ct = cuda::tiles;
+  using namespace ct::literals;
+
+  /* indicate to the compiler that the pointers are aligned (important for optimizations) */
+  a = ct::assume_aligned(a, 16_ic);
+  b = ct::assume_aligned(b, 16_ic);
+
+  /* get the block index for the x and y dimension (idz is not used) */
+  auto [idx, idy, idz] = ct::bid();
+
+  /* create tensor spans representing n x m and m x n row major matrices */
+  ct::tensor_span a_span{a, ct::extents{n, m}};
+  ct::tensor_span b_span{b, ct::extents{m, n}};
+
+  /* partition 'a' into CHUNK_N x CHUNK_M chunks and 'b' into CHUNK_M x CHUNK_N chunks */
+  auto view_a = ct::partition_view{a_span, ct::shape<CHUNK_N, CHUNK_M>{}};
+  auto view_b = ct::partition_view{b_span, ct::shape<CHUNK_M, CHUNK_N>{}};
+
+  /* load chunk (idx, idy) of 'a'; masked so chunks partially out of bounds are handled */
+  auto tile_a = view_a.load_masked(idx, idy);
+
+  /* transpose the tile locally */
+  auto tile_transposed = ct::transpose(tile_a);
+
+  /* store to chunk (idy, idx) of 'b': the chunk coordinates are swapped along with the data */
+  view_b.store_masked(tile_transposed, idy, idx);
+}
+
+int main() {
+  int n = 800; /* rows of the source, columns of the result */
+  int m = 400; /* columns of the source, rows of the result */
+
+  float* h_a = new float[n * m];
+  for (int idx = 0; idx != n * m; ++idx) {
+    h_a[idx] = idx;
+  }
+
+  float* d_a = nullptr;
+  float* d_b = nullptr;
+
+  /* ceil-divide so that partially filled edge chunks still get a block */
+  int num_blocks_n = 1 + (n - 1) / CHUNK_N;
+  int num_blocks_m = 1 + (m - 1) / CHUNK_M;
+
+  checkCudaErrors(cudaMalloc(&d_a, n * m * sizeof(float)));
+  checkCudaErrors(cudaMemcpy(d_a, h_a, n * m * sizeof(float), cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMalloc(&d_b, n * m * sizeof(float)));
+
+  transpose<<<dim3(num_blocks_n, num_blocks_m)>>>(d_a, d_b, n, m);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  float* h_b = new float[n * m];
+  checkCudaErrors(cudaMemcpy(h_b, d_b, n * m * sizeof(float), cudaMemcpyDeviceToHost));
+
+  /* release the device buffers before verifying so no exit path leaks them */
+  checkCudaErrors(cudaFree(d_a));
+  checkCudaErrors(cudaFree(d_b));
+  /* a[i][j] must equal b[j][i]; stop at the first mismatch */
+  bool ok = true;
+  for (int idx = 0; ok && idx != n; ++idx) {
+    for (int jdx = 0; ok && jdx != m; ++jdx) {
+      float expected = h_a[idx * m + jdx];
+      float actual = h_b[jdx * n + idx];
+      if (expected != actual) {
+        printf("Expected: h_b[%i][%i] == %f\n", jdx, idx, expected);
+        printf("Actual:   h_b[%i][%i] == %f\n", jdx, idx, actual);
+        ok = false;
+      }
+    }
+  }
+  if (ok) printf("Success! Matrix transpose matches expected results.\n");
+
+  delete[] h_a;
+  delete[] h_b;
+  return ok ? 0 : 1;
+}
diff --git a/cpp/9_CUDA_Tile/tileVectorAdd/CMakeLists.txt b/cpp/9_CUDA_Tile/tileVectorAdd/CMakeLists.txt
new file mode 100644
index 00000000..a7f9ab2f
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileVectorAdd/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(tileVectorAdd LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_CUDA_ARCHITECTURES 80 86 87 89 90 100 110 120)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --enable-tile")
+
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+endif()
+
+# Include directories and libraries
+include_directories(../../../Common)
+
+# Source file
+add_executable(tileVectorAdd tileVectorAdd.cu)
+
+target_compile_features(tileVectorAdd PRIVATE cxx_std_20 cuda_std_20)
+
+# Include installation configuration
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/InstallSamples.cmake)
+setup_samples_install()
diff --git a/cpp/9_CUDA_Tile/tileVectorAdd/README.md b/cpp/9_CUDA_Tile/tileVectorAdd/README.md
new file mode 100644
index 00000000..f9fcf899
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileVectorAdd/README.md
@@ -0,0 +1,24 @@
+# tileVectorAdd
+
+## Description
+
+This sample demonstrates a simple vector addition using CUDA Tile C++.
+The vector addition is performed by splitting the dataset into blocks
+which process 1024 elements at a time. The cuda::tiles::partition_view
+type is used to partition the data into chunks of size 1024. Each
+block loads its respective chunk from 'a' and 'b', performs an
+elementwise addition, then stores it to the corresponding chunk of
+'c'. Masked loads and stores are used to ensure that the last chunk
+which is partially out of bounds is correctly handled.
+
+## Expected Output
+
+```
+Success! Vector addition matches expected results.
+```
+
+## Prerequisites
+
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) version 13.3 or later.
+- [CUDA Driver](https://www.nvidia.com/en-us/drivers/) version 580 or later.
+- Host compiler with C++20 support.
diff --git a/cpp/9_CUDA_Tile/tileVectorAdd/tileVectorAdd.cu b/cpp/9_CUDA_Tile/tileVectorAdd/tileVectorAdd.cu
new file mode 100644
index 00000000..4c678758
--- /dev/null
+++ b/cpp/9_CUDA_Tile/tileVectorAdd/tileVectorAdd.cu
@@ -0,0 +1,136 @@
+/* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This sample demonstrates a simple vector addition using CUDA Tile C++.
+ * The vector addition is performed by splitting the dataset into blocks
+ * which process 1024 elements at a time. The cuda::tiles::partition_view
+ * type is used to partition the data into chunks of size 1024. Each block loads
+ * its respective chunk from 'a' and 'b', performs an elementwise addition,
+ * then stores it to the corresponding chunk of 'c'. Masked loads and stores
+ * are used to ensure that the last chunk which is partially out of bounds is
+ * correctly handled.
+ *
+ * A SIMT kernel is used to initialize the input vectors.
+ */
+
+#include "helper_cuda.h"
+
+#include "cuda_tile.h"
+#include "cuda_fp16.h"
+
+#include <cstdio>
+
+__global__ void initializeVectors(__half* a, __half* b, std::size_t n) {
+  /* one thread per element: a[i] = 0.5 * i and b[i] = 1.5 * i, as __half */
+  const auto gid = blockIdx.x * blockDim.x + threadIdx.x;
+  /* threads past the end of the vectors do nothing */
+  if (gid >= n) return;
+  a[gid] = __half{0.5 * gid};
+  b[gid] = __half{1.5 * gid};
+}
+
+/* Tile kernel computing c = a + b elementwise over n __half values ('__restrict__' is important for performance) */
+__tile_global__ void vectorAdd(__half* __restrict__ a,
+                               __half* __restrict__ b,
+                               __half* __restrict__ c,
+                               std::size_t n) {
+
+  /* namespace alias; ct::literals is presumably what provides the _ic suffix used below */
+  namespace ct = cuda::tiles;
+  using namespace ct::literals;
+
+  /* indicate to the compiler that the pointers are aligned (important for optimizations) */
+  a = ct::assume_aligned(a, 16_ic);
+  b = ct::assume_aligned(b, 16_ic);
+  c = ct::assume_aligned(c, 16_ic);
+
+  /* get the block index in the x dimension; it selects the chunk this block processes */
+  auto idx = ct::bid().x;
+
+  /* create tensor spans representing arrays of length 'n' based on the pointers 'a', 'b', and 'c' */
+  ct::tensor_span a_span{a, ct::extents{n}};
+  ct::tensor_span b_span{b, ct::extents{n}};
+  ct::tensor_span c_span{c, ct::extents{n}};
+
+  /* create partition views over the full arrays, partitioned into chunks of 1024 */
+  auto view_a = ct::partition_view{a_span, ct::shape{1024_ic}};
+  auto view_b = ct::partition_view{b_span, ct::shape{1024_ic}};
+  auto view_c = ct::partition_view{c_span, ct::shape{1024_ic}};
+
+  /* load chunk 'idx' of each input; masked so the partially out-of-bounds last chunk is handled */
+  auto tile_a = view_a.load_masked(idx);
+  auto tile_b = view_b.load_masked(idx);
+
+  /* add the tiles together, elementwise */
+  auto tile_c = tile_a + tile_b;
+
+  /* store the result tile to chunk 'idx' of the output, masked like the loads */
+  view_c.store_masked(tile_c, idx);
+}
+
+int main() {
+  __half* d_a = nullptr;
+  __half* d_b = nullptr;
+  __half* d_c = nullptr;
+
+  /* chunk_size mirrors the 1024-element partition hard-coded in vectorAdd */
+  int N = 8000;
+  int chunk_size = 1024;
+  int num_blocks = 1 + ((N - 1) / chunk_size);
+
+  checkCudaErrors(cudaMalloc(&d_a, N * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_b, N * sizeof(__half)));
+  checkCudaErrors(cudaMalloc(&d_c, N * sizeof(__half)));
+
+  /* SIMT launch: num_blocks blocks of chunk_size threads fill 'a' and 'b' */
+  initializeVectors<<<num_blocks, chunk_size>>>(d_a, d_b, N);
+  checkCudaErrors(cudaGetLastError());
+  /* tile launch: one block per 1024-element chunk */
+  vectorAdd<<<num_blocks>>>(d_a, d_b, d_c, N);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  __half* h_c = new __half[N];
+  checkCudaErrors(cudaMemcpy(h_c, d_c, N * sizeof(__half), cudaMemcpyDeviceToHost));
+  /* release the device buffers before verifying so no exit path leaks them */
+  checkCudaErrors(cudaFree(d_a));
+  checkCudaErrors(cudaFree(d_b));
+  checkCudaErrors(cudaFree(d_c));
+
+  int status = 0; /* becomes 1 at the first mismatching element */
+  for (int idx = 0; status == 0 && idx != N; ++idx) {
+    if (h_c[idx] != __half{2 * idx}) {
+      printf("Expected: h_c[%i] == %i\n", idx, 2 * idx);
+      printf("Actual:   h_c[%i] == %f\n", idx, float(h_c[idx]));
+      status = 1;
+    }
+  }
+  if (status == 0) printf("Success! Vector addition matches expected results.\n");
+  delete[] h_c;
+  return status;
+}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3f7a8f3c..b5144219 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -31,3 +31,6 @@ if(BUILD_TEGRA)
     set(CMAKE_FOLDER "8_Platform_Specific/Tegra")
     add_subdirectory(8_Platform_Specific/Tegra)
 endif()
+
+set(CMAKE_FOLDER "9_CUDA_Tile")
+add_subdirectory(9_CUDA_Tile)
diff --git a/python/1_GettingStarted/blurImageUnifiedMemory/README.md b/python/1_GettingStarted/blurImageUnifiedMemory/README.md
index 7f228dfe..93bdd26d 100644
--- a/python/1_GettingStarted/blurImageUnifiedMemory/README.md
+++ b/python/1_GettingStarted/blurImageUnifiedMemory/README.md
@@ -106,10 +106,19 @@ When returning a zero-copy view, the caller must close the buffers after use (e.
 - CUDA Toolkit 13.0 or newer
 - Python 3.10 or newer
 - `cuda-python` package (13.0.0+)
-- `cuda-core` package (>=0.6.0)
+- `cuda-core` package (>=1.0.0)
 - `numpy` package (>=2.3.2)
 - `pillow` package (10.0.0+)
 
+### Platform Support:
+
+This sample relies on `ManagedMemoryResource` with **concurrent host access**
+to managed allocations while GPU kernels are in flight. That behavior
+requires the device property `concurrent_managed_access=True`, which is only
+supported on Linux with HMM (Pascal and newer). On Windows (WDDM/MCDM/TCC)
+the property is `False`, so the sample exits early with a waive message and
+exit code `2` instead of attempting a run that would crash the process.
+
 ## Installation
 
 ```bash
diff --git a/python/1_GettingStarted/blurImageUnifiedMemory/blurImageUnifiedMemory.py b/python/1_GettingStarted/blurImageUnifiedMemory/blurImageUnifiedMemory.py
index 254f5056..20a05d2e 100644
--- a/python/1_GettingStarted/blurImageUnifiedMemory/blurImageUnifiedMemory.py
+++ b/python/1_GettingStarted/blurImageUnifiedMemory/blurImageUnifiedMemory.py
@@ -142,8 +142,8 @@ def blur_image_unified_memory(
     mr = ManagedMemoryResource(options)
 
     # Allocate unified memory buffers for source and destination images
-    src_buf = mr.allocate(n_bytes, stream)
-    dst_buf = mr.allocate(n_bytes, stream)
+    src_buf = mr.allocate(n_bytes, stream=stream)
+    dst_buf = mr.allocate(n_bytes, stream=stream)
     try:
         # Synchronize to ensure allocations are complete before CPU access
         stream.sync()
@@ -197,6 +197,14 @@ def main():
     3. Unified memory with cuda.core.ManagedMemoryResource
     4. Kernel launch with cuda.core.launch and LaunchConfig
     """
+    if sys.platform == "win32":
+        print(
+            "This sample relies on ManagedMemoryResource with concurrent host "
+            "access, which is not supported on Windows "
+            "(concurrent_managed_access=False). Waiving this sample."
+        )
+        sys.exit(2)
+
     print("=" * 60)
     print("Image Blur with Unified Memory (cuda.core)")
     print("=" * 60)
diff --git a/python/1_GettingStarted/blurImageUnifiedMemory/requirements.txt b/python/1_GettingStarted/blurImageUnifiedMemory/requirements.txt
index c7a8608e..5213dcc2 100644
--- a/python/1_GettingStarted/blurImageUnifiedMemory/requirements.txt
+++ b/python/1_GettingStarted/blurImageUnifiedMemory/requirements.txt
@@ -1,6 +1,6 @@
 # Image Blur with Unified Memory Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 numpy>=2.3.2
 pillow>=10.0.0
diff --git a/python/1_GettingStarted/copyImageArraytoGPU/README.md b/python/1_GettingStarted/copyImageArraytoGPU/README.md
index f5b426d2..46120af2 100644
--- a/python/1_GettingStarted/copyImageArraytoGPU/README.md
+++ b/python/1_GettingStarted/copyImageArraytoGPU/README.md
@@ -39,7 +39,7 @@ Copy image arrays between CPU and GPU memory using the modern `cuda.core` API wi
 ### From `cupy`:
 
 - `cp.from_dlpack()` - Create GPU array view from DLPack capsule
-- `cp.cuda.ExternalStream()` - Use external CUDA stream
+- `cp.cuda.Stream.from_external()` - Use external CUDA stream
 
 ### From `cuda_samples_utils`:
 
@@ -58,8 +58,8 @@ Copy image arrays between CPU and GPU memory using the modern `cuda.core` API wi
 - Python 3.10 or newer
 - NumPy 2.3.2 or newer (required for DLPack support)
 - `cuda-python` package (>=13.0.0+)
-- `cuda-core` package (>=0.6.0)
-- `cupy-cuda13x` package (13.0.0+)
+- `cuda-core` package (>=1.0.0)
+- `cupy-cuda13x` package (14.0.0+)
 
 ## Installation
 
@@ -73,8 +73,8 @@ pip install -r requirements.txt
 The requirements.txt installs:
 - `numpy` (2.3.2+, required for DLPack)
 - `cuda-python` (>=13.0.0+)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (13.0.0+)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (14.0.0+)
 
 ## How to Run
 
diff --git a/python/1_GettingStarted/copyImageArraytoGPU/copyImageArraytoGPU.py b/python/1_GettingStarted/copyImageArraytoGPU/copyImageArraytoGPU.py
index d78a7b6a..310fc361 100644
--- a/python/1_GettingStarted/copyImageArraytoGPU/copyImageArraytoGPU.py
+++ b/python/1_GettingStarted/copyImageArraytoGPU/copyImageArraytoGPU.py
@@ -195,7 +195,7 @@ def main():
     print(f"[Image array copy of {H}x{W}x{C} image]")
 
     # Step 2: Configure CuPy to use our CUDA stream (for interoperability)
-    cp.cuda.ExternalStream(int(stream.handle)).use()
+    cp.cuda.Stream.from_external(stream).use()
 
     # Step 3: Create a test image on CPU
     print("Creating sample image...")
diff --git a/python/1_GettingStarted/copyImageArraytoGPU/requirements.txt b/python/1_GettingStarted/copyImageArraytoGPU/requirements.txt
index 4e85c082..31aed254 100644
--- a/python/1_GettingStarted/copyImageArraytoGPU/requirements.txt
+++ b/python/1_GettingStarted/copyImageArraytoGPU/requirements.txt
@@ -2,5 +2,5 @@
 
 numpy>=2.3.2
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/1_GettingStarted/deviceQuery/README.md b/python/1_GettingStarted/deviceQuery/README.md
index 52c3797a..17e639c3 100644
--- a/python/1_GettingStarted/deviceQuery/README.md
+++ b/python/1_GettingStarted/deviceQuery/README.md
@@ -90,7 +90,7 @@ Query and display detailed properties of all CUDA-capable devices in your system
 - CUDA Toolkit 13.0 or newer (recommended; matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` package (>=13.0.0)
-- `cuda-core` package (>=0.6.0)
+- `cuda-core` package (>=1.0.0)
 
 ## Installation
 
@@ -103,7 +103,7 @@ pip install -r requirements.txt
 
 The requirements.txt installs:
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
+- `cuda-core` (>=1.0.0)
 
 ## How to Run
 
diff --git a/python/1_GettingStarted/deviceQuery/deviceQuery.py b/python/1_GettingStarted/deviceQuery/deviceQuery.py
index 06285ca4..93b9078a 100755
--- a/python/1_GettingStarted/deviceQuery/deviceQuery.py
+++ b/python/1_GettingStarted/deviceQuery/deviceQuery.py
@@ -138,7 +138,7 @@ def print_device_info(dev_id, device):
     print(f"Device {dev_id}: {device.name}")
 
     # cuda.bindings workaround: runtime version not in cuda.core
-    driver_major, driver_minor = system.get_driver_version()
+    driver_major, driver_minor = system.get_user_mode_driver_version()
     err, runtime_version = cudart.cudaRuntimeGetVersion()
     if err != cudart.cudaError_t.cudaSuccess:
         raise RuntimeError(f"Failed to get CUDA runtime version: {err}")
diff --git a/python/1_GettingStarted/deviceQuery/requirements.txt b/python/1_GettingStarted/deviceQuery/requirements.txt
index 9da207a3..a0e4feab 100644
--- a/python/1_GettingStarted/deviceQuery/requirements.txt
+++ b/python/1_GettingStarted/deviceQuery/requirements.txt
@@ -1,4 +1,4 @@
 # Device Query Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
diff --git a/python/1_GettingStarted/kernelNsysProfile/requirements.txt b/python/1_GettingStarted/kernelNsysProfile/requirements.txt
index 8145c408..e86d7bbf 100644
--- a/python/1_GettingStarted/kernelNsysProfile/requirements.txt
+++ b/python/1_GettingStarted/kernelNsysProfile/requirements.txt
@@ -2,6 +2,6 @@
 
 numpy>=2.3.2
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
 nvtx
diff --git a/python/1_GettingStarted/numpyVsCupy/numpyVsCupy.py b/python/1_GettingStarted/numpyVsCupy/numpyVsCupy.py
index 9a51cfc2..31f79b1e 100644
--- a/python/1_GettingStarted/numpyVsCupy/numpyVsCupy.py
+++ b/python/1_GettingStarted/numpyVsCupy/numpyVsCupy.py
@@ -57,7 +57,7 @@ def timer(message):
 @contextlib.contextmanager
 def gpu_timer(message, stream):
     """GPU timing context manager using cuda.core CUDA events."""
-    event_options = EventOptions(enable_timing=True)
+    event_options = EventOptions(timing_enabled=True)
     start_event = stream.record(options=event_options)
     yield
     end_event = stream.record(options=event_options)
diff --git a/python/1_GettingStarted/numpyVsCupy/requirements.txt b/python/1_GettingStarted/numpyVsCupy/requirements.txt
index bd5f9171..c895afaa 100644
--- a/python/1_GettingStarted/numpyVsCupy/requirements.txt
+++ b/python/1_GettingStarted/numpyVsCupy/requirements.txt
@@ -3,5 +3,5 @@
 
 numpy>=2.3.2
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/1_GettingStarted/simplePrint/README.md b/python/1_GettingStarted/simplePrint/README.md
index 72a7d01d..350694b0 100644
--- a/python/1_GettingStarted/simplePrint/README.md
+++ b/python/1_GettingStarted/simplePrint/README.md
@@ -68,7 +68,7 @@ CUDA Python (cuda.core), Numba CUDA, Kernel Compilation, Printf in Kernels, Mult
 - CUDA Toolkit 13.0 or newer
 - Python 3.10 or newer
 - `cuda-python` package (13.0+)
-- `cuda-core` package (>=0.6.0)
+- `cuda-core` package (>=1.0.0)
 - `numba-cuda` package (0.24.0+, for Pythonic kernel authoring)
 
 Download and install:
diff --git a/python/1_GettingStarted/simplePrint/requirements.txt b/python/1_GettingStarted/simplePrint/requirements.txt
index 7f53de68..a39c5e7e 100644
--- a/python/1_GettingStarted/simplePrint/requirements.txt
+++ b/python/1_GettingStarted/simplePrint/requirements.txt
@@ -1,7 +1,7 @@
 # Simple Printf Sample - Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 # Numba JIT uses nvJitLink from pip; keep in step with cuda-bindings (e.g. 13.2.x).
 nvidia-nvjitlink>=13.2.0
-numba-cuda>=0.24.0
+numba-cuda>=0.29.0
diff --git a/python/1_GettingStarted/systemInfo/README.md b/python/1_GettingStarted/systemInfo/README.md
index 6e15f04f..2e931964 100644
--- a/python/1_GettingStarted/systemInfo/README.md
+++ b/python/1_GettingStarted/systemInfo/README.md
@@ -63,7 +63,7 @@ Import stable symbols from the top-level `cuda.core` package (not `cuda.core.exp
 - CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
+- `cuda-core` (>=1.0.0)
 
 ## Installation
 
@@ -77,7 +77,7 @@ pip install -r requirements.txt
 The `requirements.txt` installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
+- `cuda-core` (>=1.0.0)
 
 ## How to Run
 
diff --git a/python/1_GettingStarted/systemInfo/requirements.txt b/python/1_GettingStarted/systemInfo/requirements.txt
index 79fef8ab..13628c0b 100644
--- a/python/1_GettingStarted/systemInfo/requirements.txt
+++ b/python/1_GettingStarted/systemInfo/requirements.txt
@@ -1,4 +1,4 @@
 # System Information Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
diff --git a/python/1_GettingStarted/systemInfo/systemInfo.py b/python/1_GettingStarted/systemInfo/systemInfo.py
index dd4b28ec..d5944d35 100644
--- a/python/1_GettingStarted/systemInfo/systemInfo.py
+++ b/python/1_GettingStarted/systemInfo/systemInfo.py
@@ -43,11 +43,8 @@ import sys
 
 try:
     from cuda.core import system
-    from cuda.core.system import (
-        CUDA_BINDINGS_NVML_IS_COMPATIBLE,
-        GpuP2PCapsIndex,
-        TemperatureSensors,
-    )
+    from cuda.core.system import CUDA_BINDINGS_NVML_IS_COMPATIBLE
+    from cuda.core.system.typing import GpuP2PCapsIndex
 except ImportError as e:
     print(f"Error: Required package not found: {e}")
     print("Please install from requirements.txt:")
@@ -75,10 +72,11 @@ def format_bytes(nbytes: int) -> str:
 
 def print_driver_info() -> None:
     print_header("Driver / NVML")
-    major, minor = system.get_driver_version()
-    print(f"CUDA driver version: {major}.{minor}")
-    print(f"CUDA driver version (full): {system.get_driver_version_full()}")
+    major, minor = system.get_user_mode_driver_version()
+    print(f"CUDA driver version (user-mode): {major}.{minor}")
     if CUDA_BINDINGS_NVML_IS_COMPATIBLE:
+        kmd = system.get_kernel_mode_driver_version()
+        print(f"CUDA driver version (kernel-mode): {'.'.join(str(x) for x in kmd)}")
         print(f"NVML version: {system.get_nvml_version()}")
         try:
             print(f"Driver branch: {system.get_driver_branch()}")
@@ -106,7 +104,7 @@ def print_device_info(device: "system.Device") -> None:
     except Exception as e:  # noqa: BLE001
         print(f"Architecture: unavailable ({e})")
     try:
-        print(f"Brand: {device.brand.name}")
+        print(f"Brand: {device.brand}")
     except Exception as e:  # noqa: BLE001
         print(f"Brand: unavailable ({e})")
 
@@ -133,7 +131,7 @@ def print_device_info(device: "system.Device") -> None:
 
     # Temperature (GPU sensor)
     try:
-        temp_c = device.temperature.sensor(TemperatureSensors.TEMPERATURE_GPU)
+        temp_c = device.temperature.get_sensor()
         print(f"Temperature (GPU sensor): {temp_c} C")
     except Exception as e:  # noqa: BLE001
         print(f"Temperature: unavailable ({e})")
@@ -158,12 +156,8 @@ def print_topology(devices: list) -> None:
             except Exception as e:  # noqa: BLE001
                 level_name = f"unavailable ({e})"
             try:
-                read = system.get_p2p_status(
-                    d0, d1, GpuP2PCapsIndex.P2P_CAPS_INDEX_READ
-                )
-                write = system.get_p2p_status(
-                    d0, d1, GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE
-                )
+                read = system.get_p2p_status(d0, d1, GpuP2PCapsIndex.READ)
+                write = system.get_p2p_status(d0, d1, GpuP2PCapsIndex.WRITE)
                 read_name = read.name
                 write_name = write.name
             except Exception as e:  # noqa: BLE001
diff --git a/python/1_GettingStarted/vectorAdd/README.md b/python/1_GettingStarted/vectorAdd/README.md
index 6ceee879..8abc4728 100644
--- a/python/1_GettingStarted/vectorAdd/README.md
+++ b/python/1_GettingStarted/vectorAdd/README.md
@@ -58,8 +58,8 @@ Import stable symbols from the top-level package (not `cuda.core.experimental`).
 - CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## Installation
 
@@ -73,8 +73,8 @@ pip install -r requirements.txt
 The requirements.txt installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## How to Run
 
diff --git a/python/1_GettingStarted/vectorAdd/requirements.txt b/python/1_GettingStarted/vectorAdd/requirements.txt
index 06b950e4..46d588ce 100644
--- a/python/1_GettingStarted/vectorAdd/requirements.txt
+++ b/python/1_GettingStarted/vectorAdd/requirements.txt
@@ -1,5 +1,5 @@
 # Vector Addition Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/2_CoreConcepts/binarySearch/README.md b/python/2_CoreConcepts/binarySearch/README.md
new file mode 100644
index 00000000..4bbddcec
--- /dev/null
+++ b/python/2_CoreConcepts/binarySearch/README.md
@@ -0,0 +1,129 @@
+# binarySearch (Python)
+
+## Description
+
+This sample demonstrates the parallel binary-search algorithms
+exposed by **cuda.compute** (from the `cuda-cccl` package). Given
+a sorted `d_data` array and a batch of `d_values` to locate, one
+device-wide call returns the insertion index for every value:
+
+- `cuda.compute.lower_bound` writes, for each value, the lowest index
+  where it could be inserted into `d_data` without breaking the sort
+  order. Equivalent to `numpy.searchsorted(..., side="left")`.
+- `cuda.compute.upper_bound` is the analogous upper form, equivalent
+  to `numpy.searchsorted(..., side="right")`.
+
+The sample runs both algorithms on two curated inputs: one with
+distinct elements (where `lower_bound` and `upper_bound` agree on
+any value not in the data) and one with duplicates (where they
+diverge on present values). Results are verified against
+`numpy.searchsorted`.
+
+## What You'll Learn
+
+- How to call `cuda.compute.lower_bound` / `upper_bound` with CuPy
+  arrays
+- The semantic difference between `lower_bound` and `upper_bound`,
+  especially for inputs containing duplicates
+- How the output dtype (`np.uintp`) is used for indices
+
+## Key Libraries
+
+- [`cuda.compute`](https://nvidia.github.io/cccl/python.html) (from the `cuda-cccl` package) - device algorithms
+- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - device setup
+- `cupy` - device buffers
+- `numpy` - host-side reference via `numpy.searchsorted`
+
+## Key APIs
+
+### From `cuda.compute`
+
+- `cuda.compute.lower_bound(d_data, num_items, d_values, num_values, d_out)`
+- `cuda.compute.upper_bound(d_data, num_items, d_values, num_values, d_out)`
+
+### From `cuda_samples_utils`
+
+- `print_gpu_info()` - print device name and compute capability
+
+## Requirements
+
+### Hardware
+
+- NVIDIA GPU with Compute Capability 7.0 or higher
+- Minimum GPU memory: 512 MB
+
+### Software
+
+- CUDA Toolkit 13.0 or newer
+- Python 3.10 or newer
+- `cuda-cccl` (>=1.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
+
+If the CUDA toolkit is not on your `PATH`, set `CUDA_HOME` so that
+cuda.compute's JIT path can locate its dependencies:
+
+```bash
+export CUDA_HOME=/usr/local/cuda
+```
+
+## Installation
+
+Install the required packages from `requirements.txt`:
+
+```bash
+cd /path/to/cuda-samples/python/2_CoreConcepts/binarySearch
+pip install -r requirements.txt
+```
+
+The `requirements.txt` installs:
+
+- `cuda-cccl` (>=1.0.0) - ships the `cuda.compute` module
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
+- `numpy` (>=1.24.0)
+
+## How to Run
+
+### Basic usage
+
+```bash
+cd cuda-samples/python/2_CoreConcepts/binarySearch
+python binarySearch.py
+```
+
+### With custom parameters
+
+```bash
+python binarySearch.py --device 1
+```
+
+## Expected Output
+
+```
+Device: <Your GPU Name>
+Compute Capability: <X.Y>
+
+Case 1: distinct data, mixed queries
+  data    = [1, 3, 5, 7, 9]
+  values  = [0, 3, 4, 10]
+  lower_bound: got [0, 1, 2, 5]  expected [0, 1, 2, 5]  OK
+  upper_bound: got [0, 2, 2, 5]  expected [0, 2, 2, 5]  OK
+
+Case 2: duplicates in data
+  data    = [1, 3, 3, 5, 7, 9]
+  values  = [3, 3, 5, 8]
+  lower_bound: got [1, 1, 3, 5]  expected [1, 1, 3, 5]  OK
+  upper_bound: got [3, 3, 4, 5]  expected [3, 3, 4, 5]  OK
+
+Done
+```
+
+**Note:** Device name and compute capability will vary based on your GPU.
+
+## Files
+
+- `binarySearch.py` - Python implementation
+- `README.md` - This file
+- `requirements.txt` - Sample dependencies
+- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample)
diff --git a/python/2_CoreConcepts/binarySearch/binarySearch.py b/python/2_CoreConcepts/binarySearch/binarySearch.py
new file mode 100644
index 00000000..00e54a9c
--- /dev/null
+++ b/python/2_CoreConcepts/binarySearch/binarySearch.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+This sample demonstrates the parallel binary-search algorithms exposed
+by cuda.compute (from the cuda-cccl package). Given a sorted
+``d_data`` array and a batch of ``d_values`` to locate, cuda.compute provides:
+
+  - ``cuda.compute.lower_bound(d_data, num_items, d_values, num_values, d_out)``
+    writes, for each value, the lowest index where it could be inserted
+    into d_data without breaking the sort order. Matches
+    ``numpy.searchsorted(..., side="left")``.
+
+  - ``cuda.compute.upper_bound(d_data, num_items, d_values, num_values, d_out)``
+    is the analogous upper form, matching ``side="right"``.
+
+The sample runs both algorithms on two curated sorted inputs (one with
+distinct elements, one with duplicates) so the lower/upper distinction
+is visible, verifies the results against ``numpy.searchsorted``, and
+prints the computed and expected indices for each case.
+"""
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
+
+try:
+    import cuda.compute
+    import cupy as cp
+    import numpy as np
+    from cuda.core import Device
+    from cuda_samples_utils import print_gpu_info  # noqa: E402
+except ImportError as e:
+    print(f"Error: Required package not found: {e}")
+    print("Please install from requirements.txt:")
+    print("  pip install -r requirements.txt")
+    sys.exit(1)
+
+
+def run_binary_search(h_data: np.ndarray, h_values: np.ndarray) -> bool:
+    d_data = cp.asarray(h_data)
+    d_values = cp.asarray(h_values)
+
+    d_lb = cp.empty(len(h_values), dtype=np.uintp)
+    d_ub = cp.empty(len(h_values), dtype=np.uintp)
+
+    cuda.compute.lower_bound(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_lb,
+    )
+    cuda.compute.upper_bound(
+        d_data=d_data,
+        num_items=len(d_data),
+        d_values=d_values,
+        num_values=len(d_values),
+        d_out=d_ub,
+    )
+
+    got_lb = cp.asnumpy(d_lb)
+    got_ub = cp.asnumpy(d_ub)
+    expected_lb = np.searchsorted(h_data, h_values, side="left").astype(np.uintp)
+    expected_ub = np.searchsorted(h_data, h_values, side="right").astype(np.uintp)
+
+    ok_lb = np.array_equal(got_lb, expected_lb)
+    ok_ub = np.array_equal(got_ub, expected_ub)
+
+    print(f"  data    = {h_data.tolist()}")
+    print(f"  values  = {h_values.tolist()}")
+    print(
+        f"  lower_bound: got {got_lb.tolist()}  "
+        f"expected {expected_lb.tolist()}  {'OK' if ok_lb else 'FAIL'}"
+    )
+    print(
+        f"  upper_bound: got {got_ub.tolist()}  "
+        f"expected {expected_ub.tolist()}  {'OK' if ok_ub else 'FAIL'}"
+    )
+    return ok_lb and ok_ub
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Parallel upper_bound / lower_bound via cuda.compute"
+    )
+    parser.add_argument("--device", type=int, default=0, help="CUDA device id")
+    args = parser.parse_args()
+
+    device = Device(args.device)
+    device.set_current()
+    print_gpu_info(device)
+    print()
+
+    ok = True
+
+    # Case 1: values both inside and outside the data range; no duplicates
+    # in the data. lower_bound and upper_bound agree on values not present.
+    print("Case 1: distinct data, mixed queries")
+    h_data1 = np.array([1, 3, 5, 7, 9], dtype=np.int32)
+    h_values1 = np.array([0, 3, 4, 10], dtype=np.int32)
+    ok &= run_binary_search(h_data1, h_values1)
+    print()
+
+    # Case 2: duplicates in the data so lower_bound and upper_bound diverge
+    # on present values.
+    print("Case 2: duplicates in data")
+    h_data2 = np.array([1, 3, 3, 5, 7, 9], dtype=np.int32)
+    h_values2 = np.array([3, 3, 5, 8], dtype=np.int32)
+    ok &= run_binary_search(h_data2, h_values2)
+
+    print()
+    if ok:
+        print("Done")
+        return 0
+    print("FAILED")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/2_CoreConcepts/binarySearch/requirements.txt b/python/2_CoreConcepts/binarySearch/requirements.txt
new file mode 100644
index 00000000..3110a76e
--- /dev/null
+++ b/python/2_CoreConcepts/binarySearch/requirements.txt
@@ -0,0 +1,4 @@
+cuda-cccl[cu13]>=1.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
+numpy>=1.24.0
diff --git a/python/2_CoreConcepts/blockwiseSum/blockwiseSum.py b/python/2_CoreConcepts/blockwiseSum/blockwiseSum.py
index 8471bd14..7075d186 100644
--- a/python/2_CoreConcepts/blockwiseSum/blockwiseSum.py
+++ b/python/2_CoreConcepts/blockwiseSum/blockwiseSum.py
@@ -139,7 +139,7 @@ def run_sample(num_elements: int = 1024 * 1024, device_id: int = 0) -> bool:
 
     try:
         # Make CuPy use our stream
-        cp.cuda.ExternalStream(int(stream.handle)).use()
+        cp.cuda.Stream.from_external(stream).use()
 
         # Compile kernels
         program = Program(
@@ -216,7 +216,7 @@ def run_sample(num_elements: int = 1024 * 1024, device_id: int = 0) -> bool:
         test3 = verify_array_result(d_partial, expected_partial)
 
         # Performance timing
-        event_opts = EventOptions(enable_timing=True)
+        event_opts = EventOptions(timing_enabled=True)
         iterations = 100
 
         stream.sync()
diff --git a/python/2_CoreConcepts/blockwiseSum/requirements.txt b/python/2_CoreConcepts/blockwiseSum/requirements.txt
index f7bdef83..e70c8635 100644
--- a/python/2_CoreConcepts/blockwiseSum/requirements.txt
+++ b/python/2_CoreConcepts/blockwiseSum/requirements.txt
@@ -1,6 +1,6 @@
 # Block-wise Array Sum Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/cudaComputeLambdas/README.md b/python/2_CoreConcepts/cudaComputeLambdas/README.md
new file mode 100644
index 00000000..db0586a3
--- /dev/null
+++ b/python/2_CoreConcepts/cudaComputeLambdas/README.md
@@ -0,0 +1,130 @@
+# cudaComputeLambdas (Python)
+
+## Description
+
+This sample demonstrates how **cuda.compute** (from the
+`cuda-cccl` package) accepts plain Python callables, including
+lambdas, as the operators that drive device-wide reductions,
+transforms, and scans. Internally `cuda.compute` JIT-compiles the
+callable through Numba for the GPU, so you can iterate on the
+operator in pure Python and still get a fused device-wide kernel.
+
+The sample exercises three algorithm families:
+
+1. `cuda.compute.reduce_into` - sum via `lambda a, b: a + b`.
+2. `cuda.compute.unary_transform` - elementwise `y = x*x + 1` via a
+   lambda.
+3. `cuda.compute.inclusive_scan` - prefix sum over only the even
+   values, driven by a regular Python function as the binary
+   operator.
+
+## What You'll Learn
+
+- Passing a Python `lambda` directly as the operator to a cuda.compute
+  device algorithm
+- Using a regular Python `def` function for the same purpose when the
+  op is non-trivial
+- The three core algorithm families in cuda.compute: reductions,
+  transforms, and scans
+- How cuda.compute auto-compiles the op to LTO-IR via Numba
+
+## Key Libraries
+
+- [`cuda.compute`](https://nvidia.github.io/cccl/python.html) (from the `cuda-cccl` package) - device algorithms and JIT-compiled Python ops
+- [`cuda.core`](https://nvidia.github.io/cuda-python/cuda-core/latest/) - device setup
+- `cupy` - device buffers
+- `numpy` - scalar init values and host-side verification
+
+## Key APIs
+
+### From `cuda.compute`
+
+- `cuda.compute.reduce_into(d_in, d_out, op, num_items, h_init)` - device-wide reduction
+- `cuda.compute.unary_transform(d_in, d_out, op, num_items)` - elementwise unary transform
+- `cuda.compute.inclusive_scan(d_in, d_out, op, init_value, num_items)` - inclusive prefix scan
+
+### From `cuda_samples_utils`
+
+- `print_gpu_info()` - print device name and compute capability
+
+## Requirements
+
+### Hardware
+
+- NVIDIA GPU with Compute Capability 7.0 or higher
+
+### Software
+
+- CUDA Toolkit 13.0 or newer (cuda.compute compiles ops to LTO-IR via
+  Numba, which needs the toolkit's `nvvm` and `libdevice`).
+- Python 3.10 or newer
+- `cuda-cccl` (>=1.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
+- `numba-cuda` (pulled in transitively by `cuda-cccl`)
+
+If the CUDA toolkit is not on your `PATH`, set `CUDA_HOME` so Numba
+can locate `libdevice`:
+
+```bash
+export CUDA_HOME=/usr/local/cuda
+```
+
+## Installation
+
+Install the required packages from `requirements.txt`:
+
+```bash
+cd /path/to/cuda-samples/python/2_CoreConcepts/cudaComputeLambdas
+pip install -r requirements.txt
+```
+
+The `requirements.txt` installs:
+
+- `cuda-cccl` (>=1.0.0) - ships the `cuda.compute` module
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
+- `numpy` (>=1.24.0)
+
+## How to Run
+
+### Basic usage
+
+```bash
+cd cuda-samples/python/2_CoreConcepts/cudaComputeLambdas
+python cudaComputeLambdas.py
+```
+
+### With custom parameters
+
+```bash
+python cudaComputeLambdas.py --device 1
+```
+
+## Expected Output
+
+```
+Device: <Your GPU Name>
+Compute Capability: <X.Y>
+
+reduce_into(lambda a,b: a+b) over 1..10 -> 55 (expected 55)  OK
+
+unary_transform(lambda x: x*x + 1):
+  got      = [1, 2, 5, 10, 17, 26, 37, 50]
+  expected = [1, 2, 5, 10, 17, 26, 37, 50]  OK
+
+inclusive_scan(add-evens-only) over [1,2,3,4,5,6]:
+  got      = [0, 2, 2, 6, 6, 12]
+  expected = [0, 2, 2, 6, 6, 12]  OK
+
+Done
+```
+
+**Note:** Device name and compute capability will vary based on your GPU.
+
+## Files
+
+- `cudaComputeLambdas.py` - Python implementation
+- `README.md` - This file
+- `requirements.txt` - Sample dependencies
+- `../../Utilities/cuda_samples_utils.py` - Common utilities (imported by this sample)
diff --git a/python/2_CoreConcepts/cudaComputeLambdas/cudaComputeLambdas.py b/python/2_CoreConcepts/cudaComputeLambdas/cudaComputeLambdas.py
new file mode 100644
index 00000000..d790bb4d
--- /dev/null
+++ b/python/2_CoreConcepts/cudaComputeLambdas/cudaComputeLambdas.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+cuda.compute: Python lambdas as device-wide operators
+
+This sample demonstrates how cuda.compute 1.0 (from the cuda-cccl
+package) accepts plain Python callables, including lambdas, as the
+operators that drive device-wide reductions, transforms, and scans.
+Internally cuda.compute JIT-compiles the callable with Numba for the
+device, so you can iterate on the operator in pure Python and still
+get a fused GPU kernel.
+
+The sample exercises three algorithm families with Python lambdas /
+regular functions:
+
+  1. cuda.compute.reduce_into - sum via a lambda.
+  2. cuda.compute.unary_transform - elementwise y = x*x + 1 via a lambda.
+  3. cuda.compute.inclusive_scan - prefix sum over only the even values,
+     using a regular Python function as the binary operator.
+"""
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
+
+try:
+    import cuda.compute
+    import cupy as cp
+    import numpy as np
+    from cuda.core import Device
+    from cuda_samples_utils import print_gpu_info  # noqa: E402
+except ImportError as e:
+    print(f"Error: Required package not found: {e}")
+    print("Please install from requirements.txt:")
+    print("  pip install -r requirements.txt")
+    sys.exit(1)
+
+
+def demo_reduce_lambda() -> bool:
+    """reduce_into driven by a lambda."""
+    dtype = np.int32
+    h_init = np.array([0], dtype=dtype)
+    d_in = cp.arange(1, 11, dtype=dtype)  # 1..10
+    d_out = cp.empty(1, dtype=dtype)
+
+    cuda.compute.reduce_into(
+        d_in=d_in,
+        d_out=d_out,
+        num_items=int(d_in.size),
+        op=lambda a, b: a + b,
+        h_init=h_init,
+    )
+
+    got = int(d_out.get()[0])
+    expected = int(d_in.get().sum())
+    ok = got == expected
+    print(
+        f"reduce_into(lambda a,b: a+b) over 1..10 -> {got} "
+        f"(expected {expected})  {'OK' if ok else 'FAIL'}"
+    )
+    return ok
+
+
+def demo_unary_transform_lambda() -> bool:
+    """unary_transform driven by a lambda: y = x*x + 1."""
+    d_in = cp.arange(8, dtype=cp.int32)
+    d_out = cp.empty_like(d_in)
+
+    cuda.compute.unary_transform(
+        d_in=d_in,
+        d_out=d_out,
+        num_items=int(d_in.size),
+        op=lambda x: x * x + 1,
+    )
+
+    got = d_out.get()
+    expected = (d_in.get().astype(np.int64) ** 2 + 1).astype(np.int32)
+    ok = np.array_equal(got, expected)
+    print(
+        f"unary_transform(lambda x: x*x + 1):\n"
+        f"  got      = {got.tolist()}\n"
+        f"  expected = {expected.tolist()}  {'OK' if ok else 'FAIL'}"
+    )
+    return ok
+
+
+def demo_scan_custom_op() -> bool:
+    """inclusive_scan with a Python function that sums only even values.
+
+    This shows the same pattern that also works for reduce/transform:
+    the Python callable is JIT-compiled for the device by cuda.compute.
+    """
+    dtype = np.int32
+    d_in = cp.array([1, 2, 3, 4, 5, 6], dtype=dtype)
+    d_out = cp.empty_like(d_in)
+    h_init = np.array([0], dtype=dtype)
+
+    def add_evens(a, b):
+        # Treat odd operands as zero; scan accumulates only even values.
+        return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)
+
+    cuda.compute.inclusive_scan(
+        d_in=d_in,
+        d_out=d_out,
+        op=add_evens,
+        init_value=h_init,
+        num_items=int(d_in.size),
+    )
+
+    got = d_out.get()
+    # Host reference: running sum of even-only projection of the input.
+    h_in = d_in.get()
+    proj = np.where(h_in % 2 == 0, h_in, 0)
+    expected = np.cumsum(proj).astype(dtype)
+    ok = np.array_equal(got, expected)
+    print(
+        f"inclusive_scan(add-evens-only) over [1,2,3,4,5,6]:\n"
+        f"  got      = {got.tolist()}\n"
+        f"  expected = {expected.tolist()}  {'OK' if ok else 'FAIL'}"
+    )
+    return ok
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Drive cuda.compute device algorithms with Python lambdas / callables"
+    )
+    parser.add_argument("--device", type=int, default=0, help="CUDA device id")
+    args = parser.parse_args()
+
+    device = Device(args.device)
+    device.set_current()
+    print_gpu_info(device)
+    print()
+
+    ok = True
+    ok &= demo_reduce_lambda()
+    print()
+    ok &= demo_unary_transform_lambda()
+    print()
+    ok &= demo_scan_custom_op()
+
+    print()
+    if ok:
+        print("Done")
+        return 0
+    print("FAILED")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/2_CoreConcepts/cudaComputeLambdas/requirements.txt b/python/2_CoreConcepts/cudaComputeLambdas/requirements.txt
new file mode 100644
index 00000000..3110a76e
--- /dev/null
+++ b/python/2_CoreConcepts/cudaComputeLambdas/requirements.txt
@@ -0,0 +1,4 @@
+cuda-cccl[cu13]>=1.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
+numpy>=1.24.0
diff --git a/python/2_CoreConcepts/cudaGraphs/README.md b/python/2_CoreConcepts/cudaGraphs/README.md
index 60f1d8ba..60b8c5bd 100644
--- a/python/2_CoreConcepts/cudaGraphs/README.md
+++ b/python/2_CoreConcepts/cudaGraphs/README.md
@@ -62,8 +62,8 @@ rebuilding.
 - CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## Installation
 
@@ -77,8 +77,8 @@ pip install -r requirements.txt
 The `requirements.txt` installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## How to Run
 
diff --git a/python/2_CoreConcepts/cudaGraphs/cudaGraphs.py b/python/2_CoreConcepts/cudaGraphs/cudaGraphs.py
index 204bc4f9..01be0a70 100644
--- a/python/2_CoreConcepts/cudaGraphs/cudaGraphs.py
+++ b/python/2_CoreConcepts/cudaGraphs/cudaGraphs.py
@@ -182,7 +182,7 @@ def main() -> int:
     stream = device.create_stream()
     # Tell CuPy to order its allocations on our stream so buffer initialization
     # below is serialized with the kernels we launch.
-    cp.cuda.ExternalStream(int(stream.handle)).use()
+    cp.cuda.Stream.from_external(stream).use()
 
     graph_builder = graph = None
     try:
@@ -214,9 +214,9 @@ def main() -> int:
         t_individual = run_pipeline_individual(
             stream, kernels, config, buffers, N, n_iters=args.iters
         )
-        assert cp.allclose(r3, expected, rtol=1e-5, atol=1e-5), (
-            "Individual pipeline produced incorrect results"
-        )
+        assert cp.allclose(
+            r3, expected, rtol=1e-5, atol=1e-5
+        ), "Individual pipeline produced incorrect results"
         print(
             f"\nIndividual launches: {args.iters} iters in {t_individual:.4f}s"
             f"  ({t_individual * 1e6 / args.iters:.2f} us/iter)"
@@ -228,9 +228,9 @@ def main() -> int:
 
         run_pipeline_graph(stream, graph, n_iters=5)  # warm up
         t_graph = run_pipeline_graph(stream, graph, n_iters=args.iters)
-        assert cp.allclose(r3, expected, rtol=1e-5, atol=1e-5), (
-            "Graph pipeline produced incorrect results"
-        )
+        assert cp.allclose(
+            r3, expected, rtol=1e-5, atol=1e-5
+        ), "Graph pipeline produced incorrect results"
         print(
             f"Graph replay:       {args.iters} iters in {t_graph:.4f}s"
             f"  ({t_graph * 1e6 / args.iters:.2f} us/iter)"
diff --git a/python/2_CoreConcepts/cudaGraphs/requirements.txt b/python/2_CoreConcepts/cudaGraphs/requirements.txt
index 063b924f..c650cd51 100644
--- a/python/2_CoreConcepts/cudaGraphs/requirements.txt
+++ b/python/2_CoreConcepts/cudaGraphs/requirements.txt
@@ -1,5 +1,5 @@
 # CUDA Graphs Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/2_CoreConcepts/fftSignalAnalysis/README.md b/python/2_CoreConcepts/fftSignalAnalysis/README.md
index a90816b4..87c84913 100644
--- a/python/2_CoreConcepts/fftSignalAnalysis/README.md
+++ b/python/2_CoreConcepts/fftSignalAnalysis/README.md
@@ -28,7 +28,7 @@ This sample demonstrates CuPy integration with cuda.core streams:
 stream = device.create_stream()
 
 # Use with CuPy operations
-cp.cuda.ExternalStream(int(stream.handle)).use()
+cp.cuda.Stream.from_external(stream).use()
 ```
 
 ## Key APIs
@@ -43,7 +43,7 @@ cp.cuda.ExternalStream(int(stream.handle)).use()
 
 - `cp.fft.rfft()` - Real-to-complex FFT (GPU-accelerated via cuFFT)
 - `cp.fft.rfftfreq()` - Generate frequency bins for rfft
-- `cp.cuda.ExternalStream()` - Interop with cuda.core streams
+- `cp.cuda.Stream.from_external()` - Interop with cuda.core streams
 
 ### From NumPy:
 
@@ -115,7 +115,7 @@ VERIFICATION
 GPU vs CPU FFT magnitude: Test PASSED
 
 Frequency Detection Accuracy:
-     440 Hz: ✓
+     440 Hz: [OK]
      ...
 
 Done
diff --git a/python/2_CoreConcepts/fftSignalAnalysis/fftSignalAnalysis.py b/python/2_CoreConcepts/fftSignalAnalysis/fftSignalAnalysis.py
index d1582ad5..2edce825 100644
--- a/python/2_CoreConcepts/fftSignalAnalysis/fftSignalAnalysis.py
+++ b/python/2_CoreConcepts/fftSignalAnalysis/fftSignalAnalysis.py
@@ -180,7 +180,7 @@ def run_fft_analysis(
         print(f"Compute Capability: sm_{device.arch}")
 
         # Make CuPy use our cuda.core stream
-        cp.cuda.ExternalStream(int(stream.handle)).use()
+        cp.cuda.Stream.from_external(stream).use()
 
         # Define test signal: composite of multiple frequencies
         test_frequencies = [440.0, 880.0, 1320.0, 2000.0, 5000.0]  # Hz
@@ -208,7 +208,7 @@ def run_fft_analysis(
         print("GPU FFT (cuFFT)")
         print("-" * 60)
 
-        event_opts = EventOptions(enable_timing=True)
+        event_opts = EventOptions(timing_enabled=True)
 
         # Warmup
         d_fft_result = cp.fft.rfft(d_signal)
@@ -291,7 +291,7 @@ def run_fft_analysis(
         all_found = True
         for expected_freq in test_frequencies:
             found = any(abs(f - expected_freq) < 10 for f in detected_freqs)
-            status = "✓" if found else "✗"
+            status = "[OK]" if found else "[FAIL]"
             print(f"  {expected_freq:6.0f} Hz: {status}")
             all_found = all_found and found
 
diff --git a/python/2_CoreConcepts/fftSignalAnalysis/requirements.txt b/python/2_CoreConcepts/fftSignalAnalysis/requirements.txt
index 655b86c2..12db0490 100644
--- a/python/2_CoreConcepts/fftSignalAnalysis/requirements.txt
+++ b/python/2_CoreConcepts/fftSignalAnalysis/requirements.txt
@@ -1,6 +1,6 @@
 # FFT Signal Analysis Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/greenContext/greenContext.py b/python/2_CoreConcepts/greenContext/greenContext.py
index 008a3807..b12804dc 100644
--- a/python/2_CoreConcepts/greenContext/greenContext.py
+++ b/python/2_CoreConcepts/greenContext/greenContext.py
@@ -417,7 +417,7 @@ def run_critical_alone(
     Establishes the pure compute time with every SM on the device available.
     """
     stream = device.create_stream()
-    out = device.allocate(critical_n * 4)
+    out = device.allocate(critical_n * 4, stream=stream)
     total_sm = device.resources.sm.sm_count
     try:
         opts = EventOptions(timing_enabled=True)
@@ -461,7 +461,7 @@ def run_baseline(
     """Both kernels on the primary context, two non-blocking streams."""
     long_stream = device.create_stream()
     critical_stream = device.create_stream()
-    out = device.allocate(critical_n * 4)
+    out = device.allocate(critical_n * 4, stream=critical_stream)
     total_sm = device.resources.sm.sm_count
     try:
         return _run_one(
@@ -513,7 +513,7 @@ def run_green_context(
 
         long_stream = ctx_long.create_stream()
         critical_stream = ctx_crit.create_stream()
-        out = device.allocate(critical_n * 4)
+        out = device.allocate(critical_n * 4, stream=critical_stream)
 
         return _run_one(
             device,
diff --git a/python/2_CoreConcepts/greenContext/requirements.txt b/python/2_CoreConcepts/greenContext/requirements.txt
index c79eb06c..e25bb158 100644
--- a/python/2_CoreConcepts/greenContext/requirements.txt
+++ b/python/2_CoreConcepts/greenContext/requirements.txt
@@ -1,3 +1,3 @@
 cuda-python>=13.0.0
-cuda-core>=0.7.0
+cuda-core>=1.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/jitLtoLinking/README.md b/python/2_CoreConcepts/jitLtoLinking/README.md
index 0e2bfc76..8f7dc7aa 100644
--- a/python/2_CoreConcepts/jitLtoLinking/README.md
+++ b/python/2_CoreConcepts/jitLtoLinking/README.md
@@ -64,8 +64,8 @@ linking modes are verified against a NumPy reference.
 - CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## Installation
 
@@ -79,8 +79,8 @@ pip install -r requirements.txt
 The `requirements.txt` installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## How to Run
 
diff --git a/python/2_CoreConcepts/jitLtoLinking/jitLtoLinking.py b/python/2_CoreConcepts/jitLtoLinking/jitLtoLinking.py
index 56831000..19b61f39 100644
--- a/python/2_CoreConcepts/jitLtoLinking/jitLtoLinking.py
+++ b/python/2_CoreConcepts/jitLtoLinking/jitLtoLinking.py
@@ -140,9 +140,7 @@ def link_lto(device):
     main_obj = Program(MAIN_SRC, "c++", options=prog_opts).compile("ltoir")
     user_obj = Program(USER_SRC, "c++", options=prog_opts).compile("ltoir")
 
-    linker_opts = LinkerOptions(
-        arch=f"sm_{device.arch}", link_time_optimization=True
-    )
+    linker_opts = LinkerOptions(arch=f"sm_{device.arch}", link_time_optimization=True)
     linker = Linker(main_obj, user_obj, options=linker_opts)
     return linker.link("cubin")
 
@@ -175,7 +173,9 @@ def main() -> int:
         description="JIT + LTO linking of two device modules with cuda.core"
     )
     parser.add_argument(
-        "--elements", type=int, default=1 << 16,
+        "--elements",
+        type=int,
+        default=1 << 16,
         help="Number of float32 elements (default: 65536)",
     )
     parser.add_argument("--device", type=int, default=0, help="CUDA device id")
@@ -186,7 +186,7 @@ def main() -> int:
     print_gpu_info(device)
 
     stream = device.create_stream()
-    cp.cuda.ExternalStream(int(stream.handle)).use()
+    cp.cuda.Stream.from_external(stream).use()
 
     try:
         N = args.elements
diff --git a/python/2_CoreConcepts/jitLtoLinking/requirements.txt b/python/2_CoreConcepts/jitLtoLinking/requirements.txt
index ff318f02..3b328a39 100644
--- a/python/2_CoreConcepts/jitLtoLinking/requirements.txt
+++ b/python/2_CoreConcepts/jitLtoLinking/requirements.txt
@@ -1,5 +1,5 @@
 # JIT + LTO Linking Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/2_CoreConcepts/launchConfigTuning/README.md b/python/2_CoreConcepts/launchConfigTuning/README.md
index 56b83bf2..6870f1d7 100644
--- a/python/2_CoreConcepts/launchConfigTuning/README.md
+++ b/python/2_CoreConcepts/launchConfigTuning/README.md
@@ -90,6 +90,15 @@ elapsed_ms = (end_event - start_event) / n_iterations
 - Python 3.10 or newer
 - See `requirements.txt` for Python packages
 
+### Platform Support:
+
+The benchmark loops in this sample read kernel results back from
+`ManagedMemoryResource` allocations between launches, which requires the
+device property `concurrent_managed_access=True`. This is only supported on
+Linux with Pascal or newer GPUs. On Windows (WDDM/MCDM/TCC) the property
+is `False`, so the sample exits early with a waive message and exit code
+`2`.
+
 ## Installation
 
 ```bash
@@ -115,8 +124,8 @@ Compute Capability: X.X
 
 Compiling CUDA kernels with cuda.core.Program...
   Target architecture: sm_XX
-  ✓ vector_add kernel compiled
-  ✓ reduce_sum kernel compiled
+  [OK] vector_add kernel compiled
+  [OK] reduce_sum kernel compiled
 
 ============================================================
 VECTOR ADDITION - Launch Configuration Tuning
@@ -132,11 +141,11 @@ Block Size:   64 | Blocks: 156250 | Time: X.XXXX ± X.XXXX ms
 ...
 ------------------------------------------------------------
 
-✓ OPTIMAL: block_size=XXX (X.XXXX ms)
-✗ WORST:   block_size=XXX (X.XXXX ms)
+[OK] OPTIMAL: block_size=XXX (X.XXXX ms)
+[FAIL] WORST: block_size=XXX (X.XXXX ms)
   Speedup: X.XXx
 
-✓ Results verified correct!
+[OK] Results verified correct!
 
 ...
 
diff --git a/python/2_CoreConcepts/launchConfigTuning/launchConfigTuning.py b/python/2_CoreConcepts/launchConfigTuning/launchConfigTuning.py
index 8ba9f5b5..d854ee4f 100644
--- a/python/2_CoreConcepts/launchConfigTuning/launchConfigTuning.py
+++ b/python/2_CoreConcepts/launchConfigTuning/launchConfigTuning.py
@@ -148,7 +148,7 @@ def benchmark_kernel_1d(
     stream.sync()
 
     # Timed runs with CUDA events
-    event_opts = EventOptions(enable_timing=True)
+    event_opts = EventOptions(timing_enabled=True)
     start_event = device.create_event(options=event_opts)
     end_event = device.create_event(options=event_opts)
 
@@ -178,7 +178,7 @@ def print_gpu_info(device):
 def allocate_managed_array(mr, stream, n_elements, dtype=np.float32):
     """Allocate device-preferred unified memory and return buffer with numpy view."""
     n_bytes = n_elements * np.dtype(dtype).itemsize
-    buffer = mr.allocate(n_bytes, stream)
+    buffer = mr.allocate(n_bytes, stream=stream)
     stream.sync()
 
     # Zero-copy numpy view via DLPack (holds reference to buffer)
@@ -240,11 +240,11 @@ def demo_vector_add_tuning(device, stream, mr, kernel):
 
         print("-" * 60)
         print(
-            f"\n✓ OPTIMAL: block_size={best['block_size']} "
+            f"\n[OK] OPTIMAL: block_size={best['block_size']} "
             f"({best['mean_time_ms']:.4f} ms)"
         )
         print(
-            f"✗ WORST:   block_size={worst['block_size']} "
+            f"[FAIL] WORST: block_size={worst['block_size']} "
             f"({worst['mean_time_ms']:.4f} ms)"
         )
         print(f"  Speedup: {worst['mean_time_ms']/best['mean_time_ms']:.2f}x")
@@ -253,7 +253,7 @@ def demo_vector_add_tuning(device, stream, mr, kernel):
         stream.sync()
         expected = np_a + np_b
         if np.allclose(np_c, expected):
-            print("\n✓ Results verified correct!")
+            print("\n[OK] Results verified correct!")
 
         return results
     finally:
@@ -316,7 +316,7 @@ def demo_reduction_tuning(device, stream, mr, kernel):
         worst = max(results, key=lambda x: x["mean_time_ms"])
 
         print("-" * 60)
-        print(f"\n✓ OPTIMAL: block_size={best['block_size']}")
+        print(f"\n[OK] OPTIMAL: block_size={best['block_size']}")
         print(
             f"  Speedup over worst: {worst['mean_time_ms']/best['mean_time_ms']:.2f}x"
         )
@@ -341,6 +341,14 @@ def main():
     3. Benchmarking different thread block configurations
     4. Finding optimal threads-per-block for various kernel types
     """
+    if sys.platform == "win32":
+        print(
+            "This sample relies on ManagedMemoryResource with concurrent host "
+            "access, which is not supported on Windows "
+            "(concurrent_managed_access=False). Waiving this sample."
+        )
+        sys.exit(2)
+
     print("=" * 60)
     print("Launch Configuration Tuning (cuda.core)")
     print("Finding the Best Block Size for Your Kernel")
@@ -365,10 +373,10 @@ def main():
         print(f"  Target architecture: {arch}")
 
         vec_add_kernel = compile_kernel(device, VECTOR_ADD_KERNEL, "vector_add")
-        print("  ✓ vector_add kernel compiled")
+        print("  [OK] vector_add kernel compiled")
 
         reduction_kernel = compile_kernel(device, REDUCTION_KERNEL, "reduce_sum")
-        print("  ✓ reduce_sum kernel compiled")
+        print("  [OK] reduce_sum kernel compiled")
 
         # Run demonstrations
         demo_vector_add_tuning(device, stream, mr, vec_add_kernel)
diff --git a/python/2_CoreConcepts/launchConfigTuning/requirements.txt b/python/2_CoreConcepts/launchConfigTuning/requirements.txt
index f493d29a..c9685b2f 100644
--- a/python/2_CoreConcepts/launchConfigTuning/requirements.txt
+++ b/python/2_CoreConcepts/launchConfigTuning/requirements.txt
@@ -2,5 +2,5 @@
 # Requires Python 3.10+, CUDA Toolkit 13.0+
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/matrixMulSharedMem/README.md b/python/2_CoreConcepts/matrixMulSharedMem/README.md
index 37affdf2..67c0df69 100644
--- a/python/2_CoreConcepts/matrixMulSharedMem/README.md
+++ b/python/2_CoreConcepts/matrixMulSharedMem/README.md
@@ -1,5 +1,25 @@
 # Matrix Multiplication with Shared Memory (GEMM)
 
+> **Known issue — version-pinned sample.** Unlike the other samples in this
+> repository, this sample is pinned to `cuda-core==0.7.0` and
+> `nvmath-python[cu13]==0.9.0`. The reason is that nvmath-python 0.9.0
+> still uses `cuda-core`'s pre-1.0 API name `EventOptions(enable_timing=...)`
+> in its own internals, which `cuda-core>=1.0` no longer accepts.
+>
+> If you install this sample's `requirements.txt` into the same environment
+> as the other samples, pip will downgrade `cuda-core` and the other
+> samples (which use the 1.0 API) will stop working. The recommended
+> workflow is one of:
+>
+> - Install this sample's requirements in a **dedicated virtual
+>   environment**, or
+> - Re-run the other samples' `pip install -r requirements.txt` afterwards
+>   to upgrade `cuda-core` back to 1.0.
+>
+> This sample will be re-aligned with the rest of the repository
+> (`cuda-core>=1.0.0`) once nvmath-python ships a release that targets
+> cuda-core's 1.0 naming audit.
+
 Demonstrates efficient matrix multiplication using nvmath-python APIs and custom CUDA kernels with tiling, shared memory, and loop unrolling.
 
 ## Overview
@@ -78,7 +98,7 @@ Using nvmath and cuda.core APIs
 Device: NVIDIA GeForce RTX 4090
 Compute Capability: sm_89
 
-Custom kernel compiled ✓
+Custom kernel compiled [OK]
 
 Matrix dimensions: A(1024x1024) × B(1024x1024) = C(1024x1024)
 Custom kernel tile size: 16x16
diff --git a/python/2_CoreConcepts/matrixMulSharedMem/matrixMulSharedMem.py b/python/2_CoreConcepts/matrixMulSharedMem/matrixMulSharedMem.py
index 8f809f3e..a9f85533 100644
--- a/python/2_CoreConcepts/matrixMulSharedMem/matrixMulSharedMem.py
+++ b/python/2_CoreConcepts/matrixMulSharedMem/matrixMulSharedMem.py
@@ -31,7 +31,7 @@ Demonstrates efficient matrix multiplication using:
 - nvmath.linalg.advanced.Matmul for high-performance GEMM via cuBLASLt
 - Custom CUDA kernel with tiling, shared memory, and loop unrolling
 
-Uses cuda.core APIs with CuPy arrays via ExternalStream.
+Uses cuda.core APIs with CuPy arrays via Stream.from_external.
 """
 
 import sys
@@ -121,17 +121,22 @@ def run_matmul_benchmark(
     print(f"Compute Capability: sm_{device.arch}")
 
     # Make CuPy use our cuda.core stream
-    cp.cuda.ExternalStream(int(stream.handle)).use()
+    cp.cuda.Stream.from_external(stream).use()
 
     # Compile custom kernel
     arch = f"sm_{device.arch}"
     program = Program(MATMUL_KERNEL, code_type="c++", options=ProgramOptions(arch=arch))
     kernel = program.compile(target_type="cubin").get_kernel("matmul_shared")
-    print("Custom kernel compiled ✓")
+    print("Custom kernel compiled [OK]")
 
     # Setup
     print(f"\nMatrix: A({m}x{k}) × B({k}x{n}) = C({m}x{n})")
     total_ops = 2 * m * n * k
+    # NOTE: this sample is pinned to cuda-core==0.7.0 (see requirements.txt)
+    # because nvmath-python 0.9.0 still uses cuda-core's pre-1.0 API name
+    # `enable_timing`. Once nvmath ships a release compatible with cuda-core
+    # 1.0, bump the pins in requirements.txt and rename this kwarg to
+    # `timing_enabled` to match the rest of the samples.
     event_opts = EventOptions(enable_timing=True)
 
     # Allocate matrices
diff --git a/python/2_CoreConcepts/matrixMulSharedMem/requirements.txt b/python/2_CoreConcepts/matrixMulSharedMem/requirements.txt
index 2c6cf2b2..0b397174 100644
--- a/python/2_CoreConcepts/matrixMulSharedMem/requirements.txt
+++ b/python/2_CoreConcepts/matrixMulSharedMem/requirements.txt
@@ -1,7 +1,20 @@
 # Matrix Multiplication with Shared Memory (GEMM) Requirements
+#
+# IMPORTANT: this sample pins older versions of cuda-core and nvmath-python
+# on purpose. nvmath-python 0.9.0 (the current CUDA-13 release at the time
+# of CTK 13.3) calls cuda-core's pre-1.0 API name `EventOptions(enable_timing=...)`
+# in its own internals. With cuda-core 1.0+ that kwarg was renamed to
+# `timing_enabled` and the old name is rejected, so any cuda-core>=1.0 +
+# nvmath-python 0.9.0 combination raises a TypeError at runtime.
+#
+# Until nvmath-python ships a release that targets the cuda-core 1.0 naming
+# audit, this sample requires the older cuda-core 0.7 line. Installing this
+# requirements.txt into the same environment as the other samples will
+# downgrade cuda-core; use a dedicated venv for this sample, or reinstall
+# the other samples' requirements afterwards to upgrade cuda-core back.
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core==0.7.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
-nvmath-python[cu13]>=0.3.0
+nvmath-python[cu13]==0.9.0
diff --git a/python/2_CoreConcepts/memoryResources/README.md b/python/2_CoreConcepts/memoryResources/README.md
index cc563fe5..6d6d6a94 100644
--- a/python/2_CoreConcepts/memoryResources/README.md
+++ b/python/2_CoreConcepts/memoryResources/README.md
@@ -60,15 +60,27 @@ verified on the host.
 ### Hardware
 
 - NVIDIA GPU with Compute Capability 7.0 or higher
-- Managed memory support (most discrete GPUs on Linux and Windows)
+- Managed memory support (most discrete GPUs)
 
 ### Software
 
 - CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
+
+### Platform Support
+
+The `ManagedMemoryResource` demo in this sample exercises **concurrent host
+access** to managed allocations while the GPU is active, which requires the
+device property `concurrent_managed_access=True`. This is only supported on
+Linux with Pascal or newer GPUs. On Windows (WDDM/MCDM/TCC) the property
+is `False`, so the sample exits early with a waive message and exit code
+`2`. The `DeviceMemoryResource` + `PinnedMemoryResource` demos in this
+sample would still work on Windows on their own, but to keep the sample
+self-contained the entire script waives on Windows (it checks
+`sys.platform == "win32"` rather than querying the device property).
 
 ## Installation
 
@@ -82,8 +94,8 @@ pip install -r requirements.txt
 The `requirements.txt` installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## How to Run
 
diff --git a/python/2_CoreConcepts/memoryResources/memoryResources.py b/python/2_CoreConcepts/memoryResources/memoryResources.py
index a4573a49..07776c18 100644
--- a/python/2_CoreConcepts/memoryResources/memoryResources.py
+++ b/python/2_CoreConcepts/memoryResources/memoryResources.py
@@ -165,9 +165,9 @@ def demo_managed(device, stream, kernel, size):
         stream.sync()
 
         # No explicit copy: the same numpy view observes the GPU's writes.
-        assert np.allclose(managed_view, original * 0.5 + 10.0), (
-            "Managed memory result mismatch"
-        )
+        assert np.allclose(
+            managed_view, original * 0.5 + 10.0
+        ), "Managed memory result mismatch"
         print("  GPU writes observed directly through the host-visible mapping")
     finally:
         managed_buffer.close(stream)
@@ -222,6 +222,14 @@ def main():
     parser.add_argument("--device", type=int, default=0, help="CUDA device id")
     args = parser.parse_args()
 
+    if sys.platform == "win32":
+        print(
+            "This sample relies on ManagedMemoryResource with concurrent host "
+            "access, which is not supported on Windows "
+            "(concurrent_managed_access=False). Waiving this sample."
+        )
+        sys.exit(2)
+
     device = Device(args.device)
     device.set_current()
     print_gpu_info(device)
diff --git a/python/2_CoreConcepts/memoryResources/requirements.txt b/python/2_CoreConcepts/memoryResources/requirements.txt
index 5aa8ca4c..641e4e20 100644
--- a/python/2_CoreConcepts/memoryResources/requirements.txt
+++ b/python/2_CoreConcepts/memoryResources/requirements.txt
@@ -1,5 +1,5 @@
 # Memory Resources Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/2_CoreConcepts/pageRank/README.md b/python/2_CoreConcepts/pageRank/README.md
index 64ac4b48..3be6964c 100644
--- a/python/2_CoreConcepts/pageRank/README.md
+++ b/python/2_CoreConcepts/pageRank/README.md
@@ -1,5 +1,23 @@
 # Sample: PageRank Algorithm (Python)
 
+> **Known issue — version-pinned sample.** Unlike the other samples in this
+> repository, this sample is pinned to `cuda-core<1.0.0`. The reason is that
+> `cudf-cu13` transitively requires `numba-cuda<0.29.0`, and every
+> `numba-cuda` release in that range pins `cuda-core<1.0.0`. Installing this
+> sample's `requirements.txt` into a shared environment will downgrade
+> `cuda-core` and break the other samples (which use the 1.0 API).
+>
+> The recommended workflow is one of:
+>
+> - Install this sample's requirements in a **dedicated virtual
+>   environment**, or
+> - Re-run the other samples' `pip install -r requirements.txt` afterwards
+>   to upgrade `cuda-core` back to 1.0.
+>
+> This sample will be re-aligned with the rest of the repository
+> (`cuda-core>=1.0.0`) once `cudf-cu13` ships a release that lifts its
+> `numba-cuda` upper bound.
+
 ## Description
 
 Demonstrates GPU-accelerated PageRank computation for graph analysis using RAPIDS cuGraph, with cuda.core for device, stream, and GPU timing. This sample focuses on cuda.core integration with high-level libraries (cuGraph/cuDF); for custom kernel programming (Program, LaunchConfig, launch), see the blockwiseSum sample.
@@ -16,7 +34,7 @@ Demonstrates GPU-accelerated PageRank computation for graph analysis using RAPID
 - `cugraph` - RAPIDS GPU-accelerated graph analytics
 - `cudf` - RAPIDS GPU DataFrame library
 - `cuda.core` - Device, stream, and event APIs for GPU timing
-- `cupy` - GPU array library (ExternalStream for cuDF/cuGraph)
+- `cupy` - GPU array library (Stream.from_external for cuDF/cuGraph)
 - `numpy` - CPU reference implementation
 
 ## Key APIs
@@ -25,7 +43,7 @@ Demonstrates GPU-accelerated PageRank computation for graph analysis using RAPID
 
 - `Device(0)` - Create device, `device.set_current()`, `device.create_stream()`
 - `EventOptions(enable_timing=True)` - GPU timing via `stream.record()`
-- `cp.cuda.ExternalStream(stream.handle).use()` - Make cuDF/cuGraph use cuda.core stream
+- `cp.cuda.Stream.from_external(stream).use()` - Make cuDF/cuGraph use cuda.core stream
 
 ### From cuGraph:
 
@@ -50,6 +68,14 @@ Demonstrates GPU-accelerated PageRank computation for graph analysis using RAPID
 - Python 3.10 or newer
 - See requirements.txt for package dependencies
 
+### Platform Support:
+
+This sample depends on RAPIDS (`cugraph-cu13`, `cudf-cu13`, `dask-cuda`),
+which is currently published only as **Linux (manylinux) wheels** on
+`pypi.nvidia.com` — no Windows wheels exist. On Windows the sample exits
+early with a waive message and exit code `2` instead of attempting an
+install that cannot succeed.
+
 ## Installation
 
 ```bash
diff --git a/python/2_CoreConcepts/pageRank/pageRank.py b/python/2_CoreConcepts/pageRank/pageRank.py
index 97a74b94..8aba622d 100644
--- a/python/2_CoreConcepts/pageRank/pageRank.py
+++ b/python/2_CoreConcepts/pageRank/pageRank.py
@@ -46,6 +46,14 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))
 from cuda_samples_utils import print_gpu_info, verify_array_result  # noqa: E402
 
+if sys.platform == "win32":
+    print(
+        "This sample depends on RAPIDS (cugraph-cu13 / cudf-cu13), which is "
+        "currently published only as Linux (manylinux) wheels on "
+        "pypi.nvidia.com. Waiving this sample on Windows."
+    )
+    sys.exit(2)
+
 try:
     import cudf
     import cugraph
@@ -210,8 +218,22 @@ def run_pagerank_benchmark(
     print()
     print_gpu_info(device)
 
+    # RAPIDS cuGraph wheels currently don't ship kernel binaries for
+    # every CUDA architecture. Skip cleanly on architectures known to
+    # be unsupported instead of failing deep inside cuGraph with a
+    # cryptic cudaErrorNoKernelImageForDevice. Remove an arch from this
+    # set once the matching cuGraph release ships kernels for it.
+    _CUGRAPH_UNSUPPORTED_ARCHES = {"110"}  # sm_110 = Thor / Tegra
+    if device.arch in _CUGRAPH_UNSUPPORTED_ARCHES:
+        print(
+            f"RAPIDS cuGraph does not yet ship kernels for sm_{device.arch}, "
+            "waiving this sample."
+        )
+        stream.close()
+        sys.exit(2)
+
     # Make CuPy/cuDF use our cuda.core stream
-    cp.cuda.ExternalStream(int(stream.handle)).use()
+    cp.cuda.Stream.from_external(stream).use()
 
     # Generate random graph
     print("\nGraph Parameters:")
diff --git a/python/2_CoreConcepts/pageRank/requirements.txt b/python/2_CoreConcepts/pageRank/requirements.txt
index 7ed31698..fafc9dae 100644
--- a/python/2_CoreConcepts/pageRank/requirements.txt
+++ b/python/2_CoreConcepts/pageRank/requirements.txt
@@ -1,8 +1,11 @@
 # PageRank Requirements (RAPIDS cuGraph)
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cugraph-cu13>=25.0.0
-cudf-cu13>=25.0.0
-cupy-cuda13x>=13.0.0
+# cudf-cu13 transitively pins numba-cuda<0.29.0 which requires cuda-core<1.0.0
+cuda-core<1.0.0
+cugraph-cu13>=26.0.0
+cudf-cu13>=26.0.0
+# dask-cuda <26.4 incorrectly pins cuda-core==0.3.*; require the fixed release
+dask-cuda>=26.4.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/parallelHistogram/parallelHistogram.py b/python/2_CoreConcepts/parallelHistogram/parallelHistogram.py
index d613ef5b..9fcd3afc 100644
--- a/python/2_CoreConcepts/parallelHistogram/parallelHistogram.py
+++ b/python/2_CoreConcepts/parallelHistogram/parallelHistogram.py
@@ -190,7 +190,7 @@ def _run_histogram(device, stream):
     # Benchmark using cuda.core Events (explicit Event objects recorded on stream)
     print("\nBenchmarking (100 iterations)...")
     num_iterations = 100
-    event_opts = EventOptions(enable_timing=True)
+    event_opts = EventOptions(timing_enabled=True)
     start_event = device.create_event(options=event_opts)
     end_event = device.create_event(options=event_opts)
 
diff --git a/python/2_CoreConcepts/parallelHistogram/requirements.txt b/python/2_CoreConcepts/parallelHistogram/requirements.txt
index da3c7a95..7f6d06eb 100644
--- a/python/2_CoreConcepts/parallelHistogram/requirements.txt
+++ b/python/2_CoreConcepts/parallelHistogram/requirements.txt
@@ -2,6 +2,6 @@
 # Requires Python 3.10+, CUDA Toolkit 13.0+
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 numpy>=2.3.2
-cupy-cuda13x>=13.0.0
+cupy-cuda13x>=14.0.0
diff --git a/python/2_CoreConcepts/parallelReduction/README.md b/python/2_CoreConcepts/parallelReduction/README.md
index d25712d3..7ca010f2 100644
--- a/python/2_CoreConcepts/parallelReduction/README.md
+++ b/python/2_CoreConcepts/parallelReduction/README.md
@@ -52,9 +52,9 @@ if (tid % (2 * s) == 0) {  // Don't do this!
 - CUDA Toolkit 13.0+
 - Python 3.10+
 - `cuda-python` (13.0.0+)
-- `cuda-core` (>=0.6.0)
+- `cuda-core` (>=1.0.0)
 - `cuda-cccl` (1.0.0+)
-- `cupy` (13.0.0+)
+- `cupy-cuda13x` (>=14.0.0)
 - `numpy` (>=2.3.2)
 
 ## Installation
diff --git a/python/2_CoreConcepts/parallelReduction/parallelReduction.py b/python/2_CoreConcepts/parallelReduction/parallelReduction.py
index 9f853034..f056200d 100644
--- a/python/2_CoreConcepts/parallelReduction/parallelReduction.py
+++ b/python/2_CoreConcepts/parallelReduction/parallelReduction.py
@@ -35,7 +35,7 @@ Key Concepts:
 - Reduction tree pattern: Divide-and-conquer parallel algorithm
 - Thread synchronization: Using __syncthreads() for coordination
 - Sequential thread IDs: How to avoid warp divergence
-- cuda.core Stream integration with CuPy via ExternalStream
+- cuda.core Stream integration with CuPy via Stream.from_external
 """
 
 import math
@@ -217,7 +217,7 @@ def benchmark_custom(
         stream, kernel, d_input, block_size=block_size, work_buffers=work_buffers
     )
 
-    event_opts = {"enable_timing": True}
+    event_opts = {"timing_enabled": True}
     start_event = stream.device.create_event(options=event_opts)
     end_event = stream.device.create_event(options=event_opts)
 
@@ -264,7 +264,7 @@ def benchmark_cuda_compute(
     stream.sync()
 
     d_output = cp.empty(1, dtype=cp.float32)
-    event_opts = {"enable_timing": True}
+    event_opts = {"timing_enabled": True}
     start_event = stream.device.create_event(options=event_opts)
     end_event = stream.device.create_event(options=event_opts)
 
@@ -299,7 +299,7 @@ def main() -> bool:
     device = Device(0)
     device.set_current()
     stream = device.create_stream()
-    cp_stream = cp.cuda.ExternalStream(int(stream.handle))
+    cp_stream = cp.cuda.Stream.from_external(stream)
 
     print()
     print_gpu_info(device)
diff --git a/python/2_CoreConcepts/parallelReduction/requirements.txt b/python/2_CoreConcepts/parallelReduction/requirements.txt
index d0b1eff5..a3acd886 100644
--- a/python/2_CoreConcepts/parallelReduction/requirements.txt
+++ b/python/2_CoreConcepts/parallelReduction/requirements.txt
@@ -1,7 +1,7 @@
 # Parallel Reduction Sample Requirements
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 cuda-cccl>=1.0.0
-cupy-cuda13x>=13.0.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/prefixSum/README.md b/python/2_CoreConcepts/prefixSum/README.md
index d60ba56e..6542dbf3 100644
--- a/python/2_CoreConcepts/prefixSum/README.md
+++ b/python/2_CoreConcepts/prefixSum/README.md
@@ -8,7 +8,7 @@ Demonstrates parallel prefix sum (scan) algorithms using cuda.compute with cuda.
 - Exclusive scan: `output[i] = init_value + input[0] + input[1] + ... + input[i-1]`
 - Uses cuda.compute APIs for optimized CUB-based implementations
 - Uses cuda.core APIs for device and stream management
-- Demonstrates CuPy integration via `ExternalStream`
+- Demonstrates CuPy integration via `Stream.from_external`
 
 ## Requirements
 
@@ -21,9 +21,9 @@ Demonstrates parallel prefix sum (scan) algorithms using cuda.compute with cuda.
 - CUDA Toolkit 13.0+
 - Python 3.10+
 - `cuda-python` (13.0.0+)
-- `cuda-core` (>=0.6.0)
+- `cuda-core` (>=1.0.0)
 - `cuda-cccl` (1.0.0+)
-- `cupy-cuda13x` (13.0.0+)
+- `cupy-cuda13x` (14.0.0+)
 - `numpy` (>=2.3.2)
 
 ## Usage
@@ -56,8 +56,8 @@ This sample demonstrates proper stream usage across libraries:
 # Create stream with cuda.core
 stream = device.create_stream()
 
-# Wrap for CuPy compatibility (requires int handle)
-cp_stream = cp.cuda.ExternalStream(int(stream.handle))
+# Wrap for CuPy compatibility (cuda.core Stream implements the __cuda_stream__ protocol)
+cp_stream = cp.cuda.Stream.from_external(stream)
 
 # Use with CuPy operations
 with cp_stream:
diff --git a/python/2_CoreConcepts/prefixSum/prefixSum.py b/python/2_CoreConcepts/prefixSum/prefixSum.py
index 8ca413e8..ea812071 100644
--- a/python/2_CoreConcepts/prefixSum/prefixSum.py
+++ b/python/2_CoreConcepts/prefixSum/prefixSum.py
@@ -62,7 +62,7 @@ def main() -> bool:
     device = Device(0)
     device.set_current()
     stream = device.create_stream()
-    cp_stream = cp.cuda.ExternalStream(int(stream.handle))
+    cp_stream = cp.cuda.Stream.from_external(stream)
 
     ok = True
     try:
@@ -153,7 +153,7 @@ def main() -> bool:
         )
         stream.sync()
 
-        event_opts = EventOptions(enable_timing=True)
+        event_opts = EventOptions(timing_enabled=True)
         start_event = device.create_event(options=event_opts)
         end_event = device.create_event(options=event_opts)
 
@@ -184,7 +184,7 @@ def main() -> bool:
         print("• Inclusive: output[i] includes input[i]")
         print("• Exclusive: output[i] excludes input[i], starts with init_value")
         print("• cuda.compute provides CUB-based optimized implementations")
-        print("• cuda.core Stream integrates with CuPy via ExternalStream")
+        print("• cuda.core Stream integrates with CuPy via Stream.from_external")
         print("• Applications: stream compaction, radix sort, histograms")
         print("=" * 60)
         return ok
diff --git a/python/2_CoreConcepts/prefixSum/requirements.txt b/python/2_CoreConcepts/prefixSum/requirements.txt
index 2bb41bf7..b05c5375 100644
--- a/python/2_CoreConcepts/prefixSum/requirements.txt
+++ b/python/2_CoreConcepts/prefixSum/requirements.txt
@@ -2,7 +2,7 @@
 # Requires Python 3.10+, CUDA Toolkit 13.0+
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 cuda-cccl>=1.0.0
-cupy-cuda13x>=13.0.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/processCheckpoint/processCheckpoint.py b/python/2_CoreConcepts/processCheckpoint/processCheckpoint.py
index 4115bda9..bb2146cf 100644
--- a/python/2_CoreConcepts/processCheckpoint/processCheckpoint.py
+++ b/python/2_CoreConcepts/processCheckpoint/processCheckpoint.py
@@ -195,6 +195,19 @@ def main():
     print(f"Buffer size:        {args.buffer_mib} MiB")
     print(f"Lock timeout:       {args.lock_timeout_ms} ms")
 
+    # CUDA process checkpointing relies on kernel-mode driver features
+    # that aren't shipped on integrated-GPU platforms (e.g. Tegra /
+    # Jetson / Thor). On those, Process.lock() can hang indefinitely
+    # instead of returning a clean "not supported" error. Skip cleanly
+    # rather than hanging. Remove this guard once integrated platforms
+    # gain checkpoint support.
+    if device.properties.integrated:
+        print(
+            f"CUDA process checkpointing is not supported on integrated "
+            f"GPUs (sm_{device.arch}), waiving this sample."
+        )
+        return 2
+
     print()
     print("Compiling kernel ...")
     fill_kernel = compile_fill_kernel(device)
diff --git a/python/2_CoreConcepts/processCheckpoint/requirements.txt b/python/2_CoreConcepts/processCheckpoint/requirements.txt
index c79eb06c..a0605400 100644
--- a/python/2_CoreConcepts/processCheckpoint/requirements.txt
+++ b/python/2_CoreConcepts/processCheckpoint/requirements.txt
@@ -1,3 +1,4 @@
-cuda-python>=13.0.0
-cuda-core>=0.7.0
+# cuda-bindings 13.3.0 drops the CUcheckpointRestoreArgs alias that cuda-core requires
+cuda-python>=13.0.0,<13.3.0
+cuda-core>=1.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/reduction/README.md b/python/2_CoreConcepts/reduction/README.md
index a70f767d..0f2a0056 100644
--- a/python/2_CoreConcepts/reduction/README.md
+++ b/python/2_CoreConcepts/reduction/README.md
@@ -82,7 +82,7 @@ Two-Stage Reduction Strategy:
     - Elements per block: 512
     - Output: 32768 partial sums
   Stage 2: CPU final reduction
-    - Combine 32768 partial sums → 1 final result
+    - Combine 32768 partial sums -> 1 final result
 
 Compiling CUDA kernel...
   Kernel 'blockReduceKernel_float' compiled successfully
diff --git a/python/2_CoreConcepts/reduction/reduction.py b/python/2_CoreConcepts/reduction/reduction.py
index cf838e2f..f2f0c0ac 100644
--- a/python/2_CoreConcepts/reduction/reduction.py
+++ b/python/2_CoreConcepts/reduction/reduction.py
@@ -259,7 +259,7 @@ def run(
     print(f"    - Elements per block: {threads_per_block * 2}")
     print(f"    - Output: {num_blocks} partial sums")
     print("  Stage 2: CPU final reduction")
-    print(f"    - Combine {num_blocks} partial sums → 1 final result")
+    print(f"    - Combine {num_blocks} partial sums -> 1 final result")
 
     # Compile kernel
     print("\nCompiling CUDA kernel...")
@@ -320,7 +320,7 @@ def run(
 
         # cuda.core event elapsed time (end - start) is in milliseconds (CUDA API).
         stage1_times_ms = []
-        event_options = EventOptions(enable_timing=True)
+        event_options = EventOptions(timing_enabled=True)
         start_event = stream.device.create_event(options=event_options)
         end_event = stream.device.create_event(options=event_options)
         for _ in range(test_iterations):
@@ -342,7 +342,7 @@ def run(
 
         # Stage 2 (CPU)
         print("\n> Running Stage 2 (CPU final reduction)...")
-        # Device → Host: after stream sync, partial sums are visible on host.
+        # Device -> Host: after stream sync, partial sums are visible on host.
         stream.sync()
         with cp_stream:
             h_blockSums = cp.asnumpy(d_blockSums)
diff --git a/python/2_CoreConcepts/reduction/requirements.txt b/python/2_CoreConcepts/reduction/requirements.txt
index 17ed73c7..84a6b40a 100644
--- a/python/2_CoreConcepts/reduction/requirements.txt
+++ b/python/2_CoreConcepts/reduction/requirements.txt
@@ -2,7 +2,7 @@
 # Install with: pip install -r requirements.txt
 
 numpy>=2.3.2
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 cuda-python>=13.0.0
 # Use cupy-cuda13x>=14.0.0 for cp.cuda.Stream.from_external(stream)
 cupy-cuda13x>=14.0.0
diff --git a/python/2_CoreConcepts/reductionMultiBlockCG/README.md b/python/2_CoreConcepts/reductionMultiBlockCG/README.md
index bb359f0b..825e3506 100644
--- a/python/2_CoreConcepts/reductionMultiBlockCG/README.md
+++ b/python/2_CoreConcepts/reductionMultiBlockCG/README.md
@@ -33,13 +33,24 @@ Pick a CuPy wheel that matches your CUDA major version (e.g. `cupy-cuda13x` in `
 
 ## How to run
 
-**`--cuda-include-dir` is required** (colon-separated list). Typical desktop layout:
+**`--cuda-include-dir` is required.** Multiple paths can be combined using the
+OS path separator (`:` on Linux/macOS, `;` on Windows).
+
+Linux / macOS:
 
 ```bash
 python reductionMultiBlockCG.py \
   --cuda-include-dir /usr/local/cuda/include/cccl:/usr/local/cuda/include
 ```
 
+Windows (PowerShell; note the `;` separator and the quotes around the
+combined value):
+
+```powershell
+python reductionMultiBlockCG.py `
+  --cuda-include-dir "$env:CUDA_PATH\include;$env:CUDA_PATH\include\cccl"
+```
+
 **Jetson / split include trees:** pass every directory NVRTC needs in one `--cuda-include-dir` argument, e.g.
 `/usr/local/cuda/include/cccl:/usr/local/cuda/targets/sbsa-linux/include` (adjust paths to your image). If headers are scattered, you can instead merge them into one tree with symlinks and point `--cuda-include-dir` at that folder.
 
@@ -94,15 +105,15 @@ Summary
 ======================================================================
 
 Single-kernel two-stage reduction:
-  Stage 1: 20 blocks → 20 partial sums
-  grid.sync() ← All blocks synchronize (KEY innovation)
-  Stage 2: Block 0 → 1 final result
+  Stage 1: 20 blocks -> 20 partial sums
+  grid.sync() <- All blocks synchronize (KEY innovation)
+  Stage 2: Block 0 -> 1 final result
   Total: 1 kernel launch, 137.35 GB/s
 
 Comparison:
   • Traditional: 2 kernel launches or kernel + CPU
   • This sample: 1 kernel with grid.sync() between stages
-  • Benefit: Eliminates ~5-20μs launch overhead per stage
+  • Benefit: Eliminates ~5-20us launch overhead per stage
 
 ======================================================================
 Single-Pass Multi-Block Reduction completed successfully!
diff --git a/python/2_CoreConcepts/reductionMultiBlockCG/reductionMultiBlockCG.py b/python/2_CoreConcepts/reductionMultiBlockCG/reductionMultiBlockCG.py
index bcb5c91a..bded41d8 100644
--- a/python/2_CoreConcepts/reductionMultiBlockCG/reductionMultiBlockCG.py
+++ b/python/2_CoreConcepts/reductionMultiBlockCG/reductionMultiBlockCG.py
@@ -238,8 +238,10 @@ def run(
 
     # Compile kernel
     print("\nCompiling CUDA kernel...")
-    # Support colon-separated multiple include paths
-    include_paths = cuda_include_dir.split(":")
+    # Support multiple include paths separated by the OS path separator
+    # (':' on POSIX, ';' on Windows). os.pathsep avoids splitting Windows
+    # drive prefixes like "C:\..." by accident.
+    include_paths = cuda_include_dir.split(os.pathsep)
     program_options = ProgramOptions(
         std="c++17", arch=f"sm_{device.arch}", include_path=include_paths
     )
@@ -295,7 +297,7 @@ def run(
             grid=(num_blocks, 1, 1),
             block=(threads_per_block, 1, 1),
             shmem_size=shared_mem_bytes,
-            cooperative_launch=True,
+            is_cooperative=True,
         )
 
         n_u32 = np.uint32(num_elements)
@@ -313,7 +315,7 @@ def run(
 
         # Benchmark (CUDA events — not host wall clock around the whole loop)
         print(f"\n> Running benchmark ({test_iterations} iterations)...")
-        event_options = EventOptions(enable_timing=True)
+        event_options = EventOptions(timing_enabled=True)
         start_event = stream.device.create_event(options=event_options)
         end_event = stream.device.create_event(options=event_options)
         # cuda.core event elapsed time (end - start) is in milliseconds (CUDA API).
@@ -362,15 +364,15 @@ def run(
         print("=" * 70)
         print(f"""
 Single-kernel two-stage reduction:
-  Stage 1: {num_blocks} blocks → {num_blocks} partial sums
-  grid.sync() ← All blocks synchronize (KEY innovation)
-  Stage 2: Block 0 → 1 final result
+  Stage 1: {num_blocks} blocks -> {num_blocks} partial sums
+  grid.sync() <- All blocks synchronize (KEY innovation)
+  Stage 2: Block 0 -> 1 final result
   Total: 1 kernel launch, {throughput_gb_s:.2f} GB/s
 
 Comparison:
   • Traditional: 2 kernel launches or kernel + CPU
   • This sample: 1 kernel with grid.sync() between stages
-  • Benefit: Eliminates ~5-20μs launch overhead per stage
+  • Benefit: Eliminates ~5-20us launch overhead per stage
     """)
 
         print("=" * 70)
@@ -430,8 +432,9 @@ def main():
         type=str,
         required=True,
         help=(
-            "CUDA include directory for NVRTC "
-            "(can use colon-separated paths, e.g., /path1:/path2)"
+            "CUDA include directory for NVRTC. "
+            "Use os.pathsep to separate multiple paths "
+            "(':' on POSIX, ';' on Windows)."
         ),
     )
 
diff --git a/python/2_CoreConcepts/reductionMultiBlockCG/requirements.txt b/python/2_CoreConcepts/reductionMultiBlockCG/requirements.txt
index 8d4d1a14..6d8c89fe 100644
--- a/python/2_CoreConcepts/reductionMultiBlockCG/requirements.txt
+++ b/python/2_CoreConcepts/reductionMultiBlockCG/requirements.txt
@@ -2,7 +2,7 @@
 # Install with: pip install -r requirements.txt
 
 numpy>=2.3.2
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 cuda-python>=13.0.0
 # Headers for NVRTC: cooperative_groups.h includes cuda/std/* (CCCL)
 cuda-cccl>=1.0.0
diff --git a/python/2_CoreConcepts/simpleZeroCopy/README.md b/python/2_CoreConcepts/simpleZeroCopy/README.md
index ab2f91cb..cf8a38d3 100644
--- a/python/2_CoreConcepts/simpleZeroCopy/README.md
+++ b/python/2_CoreConcepts/simpleZeroCopy/README.md
@@ -62,7 +62,7 @@ pip install -r requirements.txt
 
 Or manually:
 ```bash
-pip install numpy>=2.3.2 cuda-core>=0.6.0 cuda-python>=13.0.0
+pip install numpy>=2.3.2 cuda-core>=1.0.0 cuda-python>=13.0.0
 ```
 
 ## How to run
diff --git a/python/2_CoreConcepts/simpleZeroCopy/requirements.txt b/python/2_CoreConcepts/simpleZeroCopy/requirements.txt
index e3f69898..cfd9d89e 100644
--- a/python/2_CoreConcepts/simpleZeroCopy/requirements.txt
+++ b/python/2_CoreConcepts/simpleZeroCopy/requirements.txt
@@ -3,4 +3,4 @@
 
 numpy>=2.3.2
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
diff --git a/python/2_CoreConcepts/streamingCopyComputeOverlap/README.md b/python/2_CoreConcepts/streamingCopyComputeOverlap/README.md
index 6d352b61..7ecdc142 100644
--- a/python/2_CoreConcepts/streamingCopyComputeOverlap/README.md
+++ b/python/2_CoreConcepts/streamingCopyComputeOverlap/README.md
@@ -67,7 +67,7 @@ Using pure cuda.core APIs
 ============================================================
 
 Device: NVIDIA GeForce RTX XXXX
-Kernel compiled ✓
+Kernel compiled [OK]
 
 Problem size: 16,000,000 elements (61 MB)
 
diff --git a/python/2_CoreConcepts/streamingCopyComputeOverlap/requirements.txt b/python/2_CoreConcepts/streamingCopyComputeOverlap/requirements.txt
index 15612e66..0b19f485 100644
--- a/python/2_CoreConcepts/streamingCopyComputeOverlap/requirements.txt
+++ b/python/2_CoreConcepts/streamingCopyComputeOverlap/requirements.txt
@@ -2,5 +2,5 @@
 # Requires Python 3.10 or newer
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 numpy>=2.3.2
diff --git a/python/2_CoreConcepts/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py b/python/2_CoreConcepts/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py
index 665cce9c..b5245629 100644
--- a/python/2_CoreConcepts/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py
+++ b/python/2_CoreConcepts/streamingCopyComputeOverlap/streamingCopyComputeOverlap.py
@@ -102,7 +102,7 @@ def main():
         VECTOR_SCALE_KERNEL, code_type="c++", options=ProgramOptions(arch=arch)
     )
     kernel = program.compile(target_type="cubin").get_kernel("vector_scale")
-    print("Kernel compiled ✓")
+    print("Kernel compiled [OK]")
 
     # Parameters
     N = 16_000_000  # 16M elements
@@ -126,10 +126,10 @@ def main():
     h_in = h_out = d_in = d_out = None
     try:
         # Pre-allocate buffers
-        h_in = pinned_mr.allocate(n_bytes, default_stream)
-        h_out = pinned_mr.allocate(n_bytes, default_stream)
-        d_in = device_mr.allocate(n_bytes, default_stream)
-        d_out = device_mr.allocate(n_bytes, default_stream)
+        h_in = pinned_mr.allocate(n_bytes, stream=default_stream)
+        h_out = pinned_mr.allocate(n_bytes, stream=default_stream)
+        d_in = device_mr.allocate(n_bytes, stream=default_stream)
+        d_out = device_mr.allocate(n_bytes, stream=default_stream)
         # Sync before numpy access (numpy operations aren't stream ordered)
         default_stream.sync()
 
@@ -138,7 +138,7 @@ def main():
         np_in[:] = np.random.rand(N).astype(np.float32) * 100
 
         config = LaunchConfig(grid=((N + 255) // 256,), block=(256,))
-        event_opts = EventOptions(enable_timing=True)
+        event_opts = EventOptions(timing_enabled=True)
 
         # Warm up
         h_in.copy_to(d_in, stream=default_stream)
@@ -214,10 +214,10 @@ def main():
         h_ins, h_outs, d_ins, d_outs = [], [], [], []
         try:
             for i in range(n_streams):
-                h_ins.append(pinned_mr.allocate(chunk_bytes, streams[i]))
-                h_outs.append(pinned_mr.allocate(chunk_bytes, streams[i]))
-                d_ins.append(device_mr.allocate(chunk_bytes, streams[i]))
-                d_outs.append(device_mr.allocate(chunk_bytes, streams[i]))
+                h_ins.append(pinned_mr.allocate(chunk_bytes, stream=streams[i]))
+                h_outs.append(pinned_mr.allocate(chunk_bytes, stream=streams[i]))
+                d_ins.append(device_mr.allocate(chunk_bytes, stream=streams[i]))
+                d_outs.append(device_mr.allocate(chunk_bytes, stream=streams[i]))
 
             # Initialize input data
             for i in range(n_streams):
@@ -245,7 +245,7 @@ def main():
 
             # Benchmark with CUDA events (use stream 0 for timing)
             times = []
-            event_opts = EventOptions(enable_timing=True)
+            event_opts = EventOptions(timing_enabled=True)
             for _ in range(n_runs):
                 start_ev = device.create_event(options=event_opts)
                 end_ev = device.create_event(options=event_opts)
diff --git a/python/2_CoreConcepts/tmaTensorMap/README.md b/python/2_CoreConcepts/tmaTensorMap/README.md
index 1ae7d781..2260a0c4 100644
--- a/python/2_CoreConcepts/tmaTensorMap/README.md
+++ b/python/2_CoreConcepts/tmaTensorMap/README.md
@@ -73,8 +73,8 @@ The sample:
 - CUDA Toolkit 13.0 or newer with libcudacxx (cccl) headers
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## Installation
 
@@ -88,8 +88,8 @@ pip install -r requirements.txt
 The `requirements.txt` installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## How to Run
 
diff --git a/python/2_CoreConcepts/tmaTensorMap/requirements.txt b/python/2_CoreConcepts/tmaTensorMap/requirements.txt
index c65cd4fb..c33f5dd8 100644
--- a/python/2_CoreConcepts/tmaTensorMap/requirements.txt
+++ b/python/2_CoreConcepts/tmaTensorMap/requirements.txt
@@ -1,4 +1,4 @@
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
 numpy>=1.24.0
diff --git a/python/2_CoreConcepts/tmaTensorMap/tmaTensorMap.py b/python/2_CoreConcepts/tmaTensorMap/tmaTensorMap.py
index 82a38713..adea8ee8 100644
--- a/python/2_CoreConcepts/tmaTensorMap/tmaTensorMap.py
+++ b/python/2_CoreConcepts/tmaTensorMap/tmaTensorMap.py
@@ -64,9 +64,9 @@ try:
         LaunchConfig,
         Program,
         ProgramOptions,
-        StridedMemoryView,
         launch,
     )
+    from cuda.core.utils import StridedMemoryView
     from cuda.pathfinder import find_nvidia_header_directory, get_cuda_path_or_home
     from cuda_samples_utils import print_gpu_info
 except ImportError as e:
@@ -147,11 +147,7 @@ def _get_cccl_include_paths() -> list:
     # CUDA runtime headers - needed for the CUtensorMap driver type.
     try:
         cudart_dir = find_nvidia_header_directory("cudart")
-        if (
-            cudart_dir
-            and os.path.isdir(cudart_dir)
-            and cudart_dir not in include_path
-        ):
+        if cudart_dir and os.path.isdir(cudart_dir) and cudart_dir not in include_path:
             include_path.append(cudart_dir)
     except Exception:  # noqa: S110 - fallback probes continue below
         pass
@@ -232,9 +228,9 @@ def main() -> int:
     output = cp.zeros(n, dtype=cp.float32)
     dev.sync()  # CuPy uses its own stream
 
-    tensor_map = StridedMemoryView.from_any_interface(
-        src, stream_ptr=-1
-    ).as_tensor_map(box_dim=(TILE_SIZE,))
+    tensor_map = StridedMemoryView.from_any_interface(src, stream_ptr=-1).as_tensor_map(
+        box_dim=(TILE_SIZE,)
+    )
 
     n_tiles = n // TILE_SIZE
     config = LaunchConfig(grid=n_tiles, block=TILE_SIZE)
diff --git a/python/3_FrameworkInterop/customPyTorchKernel/README.md b/python/3_FrameworkInterop/customPyTorchKernel/README.md
index 01d11e00..502b3b74 100644
--- a/python/3_FrameworkInterop/customPyTorchKernel/README.md
+++ b/python/3_FrameworkInterop/customPyTorchKernel/README.md
@@ -11,7 +11,7 @@ This sample demonstrates how to add a custom GPU operation to PyTorch using the
 - Python 3.10+
 - PyTorch 2.0+
 - cuda-python >= 13.0.0
-- cuda-core >= 0.6.0
+- cuda-core >= 1.0.0
 
 ## Installation
 
@@ -20,6 +20,14 @@ cd python/3_FrameworkInterop/customPyTorchKernel
 pip install -r requirements.txt
 ```
 
+**Windows users:** The default `torch` wheel on PyPI for Windows is CPU-only and will cause `torch.cuda.is_available()` to return `False`. Install a CUDA-enabled build from PyTorch's wheel index *before* running the command above, so that the `torch>=2.0.0` requirement is already satisfied by the CUDA build:
+
+```bash
+pip install torch --index-url https://download.pytorch.org/whl/cu128
+```
+
+Replace `cu128` with the wheel suffix matching your installed CUDA driver (e.g. `cu121`, `cu124`, `cu126`, `cu128`). The driver's CUDA version must be >= the wheel's bundled runtime.
+
 ## How to Run
 
 ```bash
diff --git a/python/3_FrameworkInterop/customPyTorchKernel/requirements.txt b/python/3_FrameworkInterop/customPyTorchKernel/requirements.txt
index 6c4cc859..001018fa 100644
--- a/python/3_FrameworkInterop/customPyTorchKernel/requirements.txt
+++ b/python/3_FrameworkInterop/customPyTorchKernel/requirements.txt
@@ -1,5 +1,10 @@
 # Custom PyTorch Kernel Sample Requirements
+#
+# NOTE: On Windows, the default `torch` wheel from PyPI is CPU-only and the
+# sample will fail with "Torch not compiled with CUDA enabled". Install a
+# CUDA-enabled torch from PyTorch's wheel index first (see README.md):
+#   pip install torch --index-url https://download.pytorch.org/whl/cu128
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 torch>=2.0.0
diff --git a/python/3_FrameworkInterop/customTensorFlowKernel/README.md b/python/3_FrameworkInterop/customTensorFlowKernel/README.md
index 86c934a2..77192b48 100644
--- a/python/3_FrameworkInterop/customTensorFlowKernel/README.md
+++ b/python/3_FrameworkInterop/customTensorFlowKernel/README.md
@@ -22,7 +22,7 @@ Learn how to add a custom GPU operation to TensorFlow using `cuda.core` with `tf
 - Python 3.10+
 - TensorFlow 2.10+
 - cuda-python >= 13.0.0
-- cuda-core >= 0.6.0 (required for LEGACY_DEFAULT_STREAM)
+- cuda-core >= 1.0.0 (required for LEGACY_DEFAULT_STREAM)
 - numpy >= 2.3.2
 - CuPy (for device pointer access)
 
diff --git a/python/3_FrameworkInterop/customTensorFlowKernel/requirements.txt b/python/3_FrameworkInterop/customTensorFlowKernel/requirements.txt
index e11eca93..ff4c4229 100644
--- a/python/3_FrameworkInterop/customTensorFlowKernel/requirements.txt
+++ b/python/3_FrameworkInterop/customTensorFlowKernel/requirements.txt
@@ -5,10 +5,10 @@
 # - TensorFlow 2.10+: Deep learning framework (tf.py_function, tf.custom_gradient)
 # - CuPy: Internal helper for device pointer access only
 #
-# Note: cuda-core>=0.6.0 is required for LEGACY_DEFAULT_STREAM constant
+# Note: cuda-core>=1.0.0 is required for LEGACY_DEFAULT_STREAM constant
 
 numpy>=2.3.2
 tensorflow>=2.10.0
-cupy-cuda13x>=13.0.0
+cupy-cuda13x>=14.0.0
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
diff --git a/python/4_DistributedComputing/ipcMemoryPool/README.md b/python/4_DistributedComputing/ipcMemoryPool/README.md
index 6ba35aeb..cfecb531 100644
--- a/python/4_DistributedComputing/ipcMemoryPool/README.md
+++ b/python/4_DistributedComputing/ipcMemoryPool/README.md
@@ -68,8 +68,8 @@ round-trip test:
 - CUDA Toolkit 13.0 or newer (matches `cuda-python` 13.x)
 - Python 3.10 or newer
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## Installation
 
@@ -83,8 +83,8 @@ pip install -r requirements.txt
 The `requirements.txt` installs:
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 
 ## How to Run
 
diff --git a/python/4_DistributedComputing/ipcMemoryPool/ipcMemoryPool.py b/python/4_DistributedComputing/ipcMemoryPool/ipcMemoryPool.py
index acac0a90..334139d7 100644
--- a/python/4_DistributedComputing/ipcMemoryPool/ipcMemoryPool.py
+++ b/python/4_DistributedComputing/ipcMemoryPool/ipcMemoryPool.py
@@ -175,7 +175,7 @@ def main() -> int:
         f"(is_ipc_enabled={mr.is_ipc_enabled})"
     )
 
-    buffer = mr.allocate(nbytes)
+    buffer = mr.allocate(nbytes, stream=device.default_stream)
     try:
         # Fill the buffer with a known pattern from the parent side.
         arr = cp.from_dlpack(buffer).view(dtype=cp.float32)
diff --git a/python/4_DistributedComputing/ipcMemoryPool/requirements.txt b/python/4_DistributedComputing/ipcMemoryPool/requirements.txt
index c65cd4fb..c33f5dd8 100644
--- a/python/4_DistributedComputing/ipcMemoryPool/requirements.txt
+++ b/python/4_DistributedComputing/ipcMemoryPool/requirements.txt
@@ -1,4 +1,4 @@
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
 numpy>=1.24.0
diff --git a/python/4_DistributedComputing/multiGPUGradientAverage/README.md b/python/4_DistributedComputing/multiGPUGradientAverage/README.md
index 26aa79d2..c6cfa859 100644
--- a/python/4_DistributedComputing/multiGPUGradientAverage/README.md
+++ b/python/4_DistributedComputing/multiGPUGradientAverage/README.md
@@ -8,7 +8,7 @@ This sample demonstrates gradient averaging across multiple GPUs using MPI and c
 
 - How to initialize MPI for multi-process GPU communication
 - How to map MPI ranks to CUDA devices consistently
-- How to integrate cuda.core streams with CuPy using `ExternalStream`
+- How to integrate cuda.core streams with CuPy using `Stream.from_external`
 - How to compile and launch custom CUDA kernels using cuda.core
 - How to use cuda.core Event for GPU timing measurements
 - How to use MPI Allreduce with host-staging for universal compatibility
@@ -23,12 +23,17 @@ This sample demonstrates gradient averaging across multiple GPUs using MPI and c
 ## Installation
 
 ```bash
-pip install mpi4py cupy-cuda13x cuda-python cuda-core
+pip install -r requirements.txt
 ```
 
 ## Running
 
-**IMPORTANT:** This sample **MUST** be run with `mpirun` with at least 2 processes.
+**IMPORTANT:** This sample **MUST** be launched by an MPI runtime with at
+least 2 processes. On Linux/macOS the launcher is typically `mpirun -np N`; on
+Windows with Microsoft MPI it is `mpiexec -n N`. Most MPI stacks accept both
+launcher names and both process-count flags (`-n` and `-np`).
+
+Linux / macOS (OpenMPI, MPICH, Intel MPI):
 
 ```bash
 # Single node (2 GPUs)
@@ -41,6 +46,14 @@ mpirun -np 4 python multiGPUGradientAverage.py --size 10000
 CUDA_VISIBLE_DEVICES=0,2 mpirun -np 2 python multiGPUGradientAverage.py
 ```
 
+Windows (Microsoft MPI — `mpiexec` is installed under
+`C:\Program Files\Microsoft MPI\Bin\` and is not on PATH by default):
+
+```powershell
+& "C:\Program Files\Microsoft MPI\Bin\mpiexec.exe" -n 2 `
+    python multiGPUGradientAverage.py --size 10000
+```
+
 ## Sample Output
 
 ```
@@ -82,11 +95,11 @@ Demo complete.
 
 ## Key Technical Details
 
-The sample uses cuda.core streams and makes CuPy use them via `ExternalStream`:
+The sample uses cuda.core streams and makes CuPy use them via `Stream.from_external`:
 
 ```python
 stream = device.create_stream()
-cp.cuda.ExternalStream(int(stream.handle)).use()
+cp.cuda.Stream.from_external(stream).use()
 ```
 
 GPU timing is measured using cuda.core Event:
@@ -107,4 +120,9 @@ The host-staging pattern transfers data GPU → CPU → MPI → CPU → GPU for
 
 **Error: "This sample requires at least 2 MPI processes!"**
 
-Solution: Run with `mpirun -np 2 python multiGPUGradientAverage.py`
+Solution:
+- Linux / macOS: `mpirun -np 2 python multiGPUGradientAverage.py`
+- Windows (Microsoft MPI): `& "C:\Program Files\Microsoft MPI\Bin\mpiexec.exe" -n 2 python multiGPUGradientAverage.py`
+  (or `mpiexec -n 2 ...` after adding `C:\Program Files\Microsoft MPI\Bin\` to `PATH`).
+
+See the **Running** section above for fully-formed examples.
diff --git a/python/4_DistributedComputing/multiGPUGradientAverage/multiGPUGradientAverage.py b/python/4_DistributedComputing/multiGPUGradientAverage/multiGPUGradientAverage.py
index 02a47672..44cbeb49 100644
--- a/python/4_DistributedComputing/multiGPUGradientAverage/multiGPUGradientAverage.py
+++ b/python/4_DistributedComputing/multiGPUGradientAverage/multiGPUGradientAverage.py
@@ -117,7 +117,7 @@ def init_device(rank: int):
 
     # Create cuda.core stream and make CuPy use it
     stream = device.create_stream()
-    cp.cuda.ExternalStream(int(stream.handle)).use()
+    cp.cuda.Stream.from_external(stream).use()
 
     return device, stream
 
@@ -317,7 +317,7 @@ def main():
 
         # Step 1: Compute local gradients on each GPU
         # Use cuda.core Event for GPU timing measurements
-        timing_options = EventOptions(enable_timing=True)
+        timing_options = EventOptions(timing_enabled=True)
         start_event = stream.record(options=timing_options)
 
         local_grad = compute_local_gradients(num_elements, device, stream, rank)
diff --git a/python/4_DistributedComputing/multiGPUGradientAverage/requirements.txt b/python/4_DistributedComputing/multiGPUGradientAverage/requirements.txt
index f719cfa4..7ff01293 100644
--- a/python/4_DistributedComputing/multiGPUGradientAverage/requirements.txt
+++ b/python/4_DistributedComputing/multiGPUGradientAverage/requirements.txt
@@ -5,14 +5,14 @@ mpi4py>=3.1.4
 
 # GPU array library (NumPy-compatible arrays on CUDA)
 # Use cupy-cuda11x, cupy-cuda12x, or cupy-cuda13x depending on your CUDA version
-cupy-cuda13x>=13.0.0
+cupy-cuda13x>=14.0.0
 
 # CUDA Python bindings (low-level CUDA driver API)
 cuda-python>=13.0.0
 
 # cuda.core - Modern Python interface for CUDA
 # Provides Program, LaunchConfig, Device, and launch APIs
-cuda-core>=0.6.0
+cuda-core>=1.0.0
 
 # Note: This sample uses host-staging for MPI communication
 # Standard MPI installation is sufficient (no CUDA-aware MPI required)
diff --git a/python/4_DistributedComputing/simpleP2P/README.md b/python/4_DistributedComputing/simpleP2P/README.md
index 9f5f01fe..77410907 100644
--- a/python/4_DistributedComputing/simpleP2P/README.md
+++ b/python/4_DistributedComputing/simpleP2P/README.md
@@ -94,7 +94,7 @@ pip install -r requirements.txt
 
 Or manually:
 ```bash
-pip install numpy>=2.3.2 cuda-core>=0.6.0 cuda-python>=13.0.0
+pip install numpy>=2.3.2 cuda-core>=1.0.0 cuda-python>=13.0.0
 ```
 
 ## How to run
diff --git a/python/4_DistributedComputing/simpleP2P/requirements.txt b/python/4_DistributedComputing/simpleP2P/requirements.txt
index d0365796..4626ed5d 100644
--- a/python/4_DistributedComputing/simpleP2P/requirements.txt
+++ b/python/4_DistributedComputing/simpleP2P/requirements.txt
@@ -3,4 +3,4 @@
 
 numpy>=2.3.2
 cuda-python>=13.0.0
-cuda-core>=0.6.0
+cuda-core>=1.0.0
diff --git a/python/4_DistributedComputing/simpleP2P/simpleP2P.py b/python/4_DistributedComputing/simpleP2P/simpleP2P.py
index 534f357a..7f3482d3 100644
--- a/python/4_DistributedComputing/simpleP2P/simpleP2P.py
+++ b/python/4_DistributedComputing/simpleP2P/simpleP2P.py
@@ -161,13 +161,13 @@ def run(num_elements=1024 * 1024 * 16):
     dev0.set_current()
     mr0 = DeviceMemoryResource(dev0)
     mr0.peer_accessible_by = [gpuid[1]]  # Grant GPU 1 access to GPU 0's memory
-    g0 = mr0.allocate(buf_size)
+    g0 = mr0.allocate(buf_size, stream=dev0.default_stream)
 
     # Allocate on GPU 1 and grant access to GPU 0
     dev1.set_current()
     mr1 = DeviceMemoryResource(dev1)
     mr1.peer_accessible_by = [gpuid[0]]  # Grant GPU 0 access to GPU 1's memory
-    g1 = mr1.allocate(buf_size)
+    g1 = mr1.allocate(buf_size, stream=dev1.default_stream)
 
     print(f"  Peer access enabled: GPU{gpuid[0]} <-> GPU{gpuid[1]}")
     print(
@@ -177,7 +177,7 @@ def run(num_elements=1024 * 1024 * 16):
 
     # Allocate pinned host memory
     pinned_mr = PinnedMemoryResource()
-    h0 = pinned_mr.allocate(buf_size)
+    h0 = pinned_mr.allocate(buf_size, stream=dev0.default_stream)
 
     print("  Memory allocated successfully")
 
@@ -190,7 +190,7 @@ def run(num_elements=1024 * 1024 * 16):
         print("\nMeasuring P2P bandwidth...")
         print("  Performing 100 ping-pong copies between GPUs...")
 
-        event_options = EventOptions(enable_timing=True)
+        event_options = EventOptions(timing_enabled=True)
         sync_event0 = None
         sync_event1 = None
 
@@ -206,7 +206,7 @@ def run(num_elements=1024 * 1024 * 16):
                 # Copy g0 -> g1 on stream0
                 g1.copy_from(g0, stream=stream0)
                 # Record event on stream0 to signal completion of this copy
-                sync_event0 = stream0.record(options=EventOptions(enable_timing=False))
+                sync_event0 = stream0.record(options=EventOptions(timing_enabled=False))
             else:
                 # Wait for previous stream0 copy to complete
                 if sync_event0 is not None:
@@ -214,7 +214,7 @@ def run(num_elements=1024 * 1024 * 16):
                 # Copy g1 -> g0 on stream1
                 g0.copy_from(g1, stream=stream1)
                 # Record event on stream1 to signal completion of this copy
-                sync_event1 = stream1.record(options=EventOptions(enable_timing=False))
+                sync_event1 = stream1.record(options=EventOptions(timing_enabled=False))
 
         # Wait for last stream1 copy to complete
         if sync_event1 is not None:
diff --git a/python/Utilities/README.md b/python/Utilities/README.md
index 2596ac56..19c2b47f 100644
--- a/python/Utilities/README.md
+++ b/python/Utilities/README.md
@@ -18,8 +18,8 @@ pip install -r requirements.txt
 This installs a common CUDA 13 stack (see `python/requirements.txt`):
 
 - `cuda-python` (>=13.0.0)
-- `cuda-core` (>=0.6.0)
-- `cupy-cuda13x` (>=13.0.0)
+- `cuda-core` (>=1.0.0)
+- `cupy-cuda13x` (>=14.0.0)
 - `numpy` (>=2.3.2)
 
 ## How to Use in Samples
diff --git a/python/requirements.txt b/python/requirements.txt
index 1895a844..fef8b4d9 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -2,6 +2,6 @@
 # Install sample-specific extras from each sample's requirements.txt (RAPIDS, TensorFlow, etc.).
 
 cuda-python>=13.0.0
-cuda-core>=0.6.0
-cupy-cuda13x>=13.0.0
+cuda-core>=1.0.0
+cupy-cuda13x>=14.0.0
 numpy>=2.3.2
diff --git a/run_tests.py b/run_tests.py
index a292dde8..5270b9b9 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -38,6 +38,9 @@ import threading
 
 print_lock = threading.Lock()
 
+# Exit code for waived tests is 2
+EXIT_WAIVED = 2
+
 def safe_print(*args, **kwargs):
     """Thread-safe print function"""
     with print_lock:
@@ -112,7 +115,12 @@ def run_single_test_instance(executable, args, output_file, global_args, run_des
                 cwd=os.path.dirname(exe_path) # Execute in the executable's directory
             )
 
-        status = "Passed" if result.returncode == 0 else "Failed"
+        if result.returncode == 0:
+            status = "Passed"
+        elif result.returncode == EXIT_WAIVED:
+            status = "Waived"
+        else:
+            status = "Failed"
         safe_print(f"    Finished {exe_name} {run_description}: {status} (code {result.returncode})")
         return {"name": exe_name, "description": run_description, "return_code": result.returncode, "status": status}
 
@@ -258,6 +266,7 @@ def main():
             })
 
     failed = []
+    waived = []
     total_runs = len(tasks)
     completed_runs = 0
 
@@ -278,7 +287,10 @@ def main():
             safe_print(f"Progress: {completed_runs}/{total_runs} runs completed.")
             try:
                 result = future.result()
-                if result["return_code"] != 0:
+                rc = result["return_code"]
+                if rc == EXIT_WAIVED:
+                    waived.append(result)
+                elif rc != 0:
                     failed.append(result)
             except Exception as exc:
                 safe_print(f'Task {task_info["executable"].name} {task_info["description"]} generated an exception: {exc}')
@@ -292,6 +304,10 @@ def main():
     # Print summary
     print("\nTest Summary:")
     print(f"Ran {total_runs} test runs for {len(executables)} executables.")
+    if waived:
+        print(f"Waived runs ({len(waived)}) — hardware/requirements not met (exit {EXIT_WAIVED}), not counted as failure:")
+        for w in waived:
+            print(f"  {w['name']} {w['description']}: {w['status']} (code {w['return_code']})")
     if failed:
         print(f"Failed runs ({len(failed)}):")
         for fail in failed:
@@ -300,7 +316,10 @@ def main():
         first_failure_code = next((f["return_code"] for f in failed if f["return_code"] != -1), 1)
         return first_failure_code
     else:
-        print("All test runs passed!")
+        if waived:
+            print("No failures (waived runs are acceptable).")
+        else:
+            print("All test runs passed!")
         return 0
 
 if __name__ == '__main__':
diff --git a/test_args.json b/test_args.json
index 59eb1efc..c9406561 100644
--- a/test_args.json
+++ b/test_args.json
@@ -371,5 +371,16 @@
     },
     "EGLStream_CUDA_CrossGPU": {
         "min_gpus": 2
+    },
+    "reductionMultiBlockCG": {
+        "python": {
+            "args": ["--cuda-include-dir=$CUDA_HOME/include:$CUDA_HOME/include/cccl"]
+        }
+    },
+    "multiGPUGradientAverage": {
+        "python": {
+            "launcher": ["mpirun", "--allow-run-as-root", "-np", "2"],
+            "args": ["--size", "1024"]
+        }
     }
 }