2025-07-03 06:00:31 +08:00
1375 changed files with 257442 additions and 107569 deletions
--- a/.clang-format
+++ b/.clang-format
@ -1,49 +0,0 @@
---
-AccessModifierOffset: -4
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: Consecutive
-AlignConsecutiveDeclarations: Consecutive
-AlignConsecutiveMacros: Consecutive
-AlignEscapedNewlines: Left
-AlignOperands: AlignAfterOperator
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: false
-BinPackArguments: false
-BinPackParameters: false
-BraceWrapping:
-    AfterClass: true
-    AfterControlStatement: false
-    AfterExternBlock: true
-    AfterFunction: true
-    AfterStruct: true
-    AfterUnion: true
-    BeforeCatch: true
-    BeforeElse: true
-    IndentBraces: false
-BreakBeforeBraces: Custom
-BreakBeforeConceptDeclarations: true
-BreakBeforeBinaryOperators: NonAssignment
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: BeforeComma
-BreakInheritanceList: BeforeComma
-ColumnLimit: 120
-DerivePointerAlignment: false
-FixNamespaceComments: true
-IncludeCategories:
-  - Regex:           '^<.*>'
-    Priority:        1
-  - Regex:           '^".*"'
-    Priority:        2
-SortIncludes: true
-IncludeBlocks: Regroup
-IndentWidth: 4
-MaxEmptyLinesToKeep: 2
-PointerAlignment: Right
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-Standard: c++17
-TabWidth: 4
-UseTab: Never
-...
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,3 @@
 build
 .vs
 .clangd
-test
-settings.json
-launch.json
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,106 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-ci:
-    autofix_commit_msg: |
-      [pre-commit.ci] auto code formatting
-    autofix_prs: false
-    autoupdate_branch: ''
-    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
-    autoupdate_schedule: quarterly
-    skip: []
-    submodules: false
-
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
-    hooks:
-      - id: end-of-file-fixer
-        exclude: |
-          (?x)^(
-            .*\.raw$|
-            .*\.bin$|
-            .*\.dat$|
-            .*\.nv12$|
-            data/.*|
-            Common/.*
-          )
-        files: |
-          (?x)^(
-            .*\.txt$|
-            .*\.md$|
-            .*\.cpp$|
-            .*\.cxx$|
-            .*\.hpp$|
-            .*\.h$|
-            .*\.cu$|
-            .*\.cuh$|
-            .*\.py$|
-            .*\.json$
-          )
-      - id: mixed-line-ending
-        exclude: |
-          (?x)^(
-            .*\.raw$|
-            .*\.bin$|
-            .*\.dat$|
-            .*\.nv12$|
-            data/.*|
-            Common/.*
-          )
-        files: |
-          (?x)^(
-            .*\.txt$|
-            .*\.md$|
-            .*\.cpp$|
-            .*\.cxx$|
-            .*\.hpp$|
-            .*\.h$|
-            .*\.cu$|
-            .*\.cuh$|
-            .*\.py$|
-            .*\.json$
-          )
-      - id: trailing-whitespace
-        exclude: |
-          (?x)^(
-            .*\.raw$|
-            .*\.bin$|
-            .*\.dat$|
-            .*\.nv12$|
-            data/.*|
-            Common/.*
-          )
-        files: |
-          (?x)^(
-            .*\.txt$|
-            .*\.md$|
-            .*\.cpp$|
-            .*\.cxx$|
-            .*\.hpp$|
-            .*\.h$|
-            .*\.cu$|
-            .*\.cuh$|
-            .*\.py$|
-            .*\.json$
-          )
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.6
-    hooks:
-      - id: clang-format
-        types_or: [file]
-        files: |
-          (?x)^(
-            ^.*\.c$|
-            ^.*\.cpp$|
-            ^.*\.cu$|
-            ^.*\.cuh$|
-            ^.*\.cxx$|
-            ^.*\.h$|
-            ^.*\.hpp$|
-            ^.*\.inl$|
-            ^.*\.mm$
-          )
-        exclude: |
-          (?x)^(
-            Common/.*
-          )
-        args: ["-fallback-style=none", "-style=file", "-i"]
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,15 +1,5 @@
 ## Changelog

-### CUDA 12.9
-* Updated toolchain for cross-compilation for Tegra Linux platforms.
-* Added `run_tests.py` utility to exercise all samples. See README.md for details
-* Repository has been updated with consistent code formatting across all samples
-* Many small code tweaks and bug fixes (see commit history for details)
-* Removed the following outdated samples:
-  * `1_Utilities`
-    * `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth
-    testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth)
-
 ### CUDA 12.8
 * Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles.
 * Removed the following outdated samples:
@ -46,7 +36,6 @@
    * `cuDLALayerwiseStatsHybrid`
    * `cuDLALayerwiseStatsStandalone`
    * `cuDLAStandaloneMode`
-    * `cudaNvSciBufMultiplanar`
    * `cudaNvSciNvMedia`
    * `fluidsGLES`
    * `nbody_opengles`
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,10 +16,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,103 +0,0 @@
-
-# Contributing to the CUDA Samples
-
-Thank you for your interest in contributing to the CUDA Samples!
-
-
-## Getting Started
-
-1. **Fork & Clone the Repository**:
-
-   Fork the reporistory and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository).
-
-## Making Changes
-
-1. **Create a New Branch**:
-
-   ```bash
-   git checkout -b your-feature-branch
-   ```
-
-2. **Make Changes**.
-
-3. **Build and Test**:
-
-   Ensure changes don't break existing functionality by building and running tests.
-
-   For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below.
-
-4. **Commit Changes**:
-
-   ```bash
-   git commit -m "Brief description of the change"
-   ```
-
-## Building and Testing
-
-For information on building a running tests on the samples, please refer to the main [README](README.md)
-
-## Creating a Pull Request
-
-1. Push changes to your fork
-2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request.
-3. Describe the purpose and context of the changes in the pull request description.
-
-## Code Formatting (pre-commit hooks)
-
-The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These
-tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter
-versions and options are aligned for all developers. Additionally, there is a CI check in place to
-enforce that committed code follows our standards.
-
-The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`.
-For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html).
-
-To use `pre-commit`, install via `conda` or `pip`:
-
-```bash
-conda config --add channels conda-forge
-conda install pre-commit
-```
-
-```bash
-pip install pre-commit
-```
-
-Then run pre-commit hooks before committing code:
-
-```bash
-pre-commit run
-```
-
-By default, pre-commit runs on staged files (only changes and additions that will be committed).
-To run pre-commit checks on all files, execute:
-
-```bash
-pre-commit run --all-files
-```
-
-Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:
-
-```bash
-pre-commit install
-```
-
-Now code linters and formatters will be run each time you commit changes.
-
-You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, althoguh please note
-that this may result in pull requests being rejected if subsequent checks fail.
-
-## Review Process
-
-Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code.
-
-For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/).
-
-Further recommended reading for successful PR reviews:
-
- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/)
- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/)
-
-## Thank You
-
-Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration!
--- a/Common/helper_cuda_drvapi.h
+++ b/Common/helper_cuda_drvapi.h
@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
      }

      unsigned long long compute_perf =
-          ((unsigned long long)multiProcessorCount * sm_per_multiproc *
+          (unsigned long long)(multiProcessorCount * sm_per_multiproc *
                               clockRate);

      if (compute_perf > max_compute_perf) {
--- a/Common/nvMatrix.h
+++ b/Common/nvMatrix.h
@ -258,7 +258,7 @@ namespace nv
                s[2] = &r3[0];
                s[3] = &r4[0];

-                int i,j,p,jj;
+                register int i,j,p,jj;

                for (i=0; i<4; i++)
                {
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA developers that demonstrate features of the CUDA Toolkit. This version supports [CUDA Toolkit 12.6](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

@ -14,7 +14,7 @@ This section describes the release notes for the CUDA Samples on GitHub only.

 ### Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).

 ### Getting the CUDA Samples
@ -72,17 +72,6 @@ Open the generated solution file CUDA_Samples.sln in Visual Studio. Build the sa

 Run the samples from the output directories specified in Visual Studio.

-### Enabling On-GPU Debugging
-
-NVIDIA GPUs support on-GPU debugging through cuda-gdb. Enabling this may significantly affect application performance as certain compiler optimizations are disabled
-in this configuration, hence it's not on by default. Enablement of on-device debugging is controlled via the `-G` switch to nvcc.
-
-To enable cuda-gdb for samples builds, define the `ENABLE_CUDA_DEBUG` flag on the CMake command line. For example:
-
-```
-cmake -DENABLE_CUDA_DEBUG=True ...
-```
-
 ### Platform-Specific Samples

 Some CUDA samples are specific to certain platforms, and require passing flags into CMake to enable. In particular, we define the following platform-specific flags:
@ -105,9 +94,9 @@ Navigate to the root of the cloned repository and create a build directory:
 ```
 mkdir build && cd build
 ```
-Configure the project with CMake, specifying the Tegra toolchain file. And you can use -DTARGET_FS to point to the target file system root path for necessary include and library files:
+Configure the project with CMake, specifying the Tegra toolchain file:
 ```
-cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake -DTARGET_FS=/path/to/target/system/file/system
+cmake .. -DCMAKE_TOOLCHAIN_FILE=/path/to/tegra/toolchain.cmake
 ```
 Build the samples:
 ```
@ -122,7 +111,7 @@ Instead of being in the default location, `/usr/local/cuda/include` or `/usr/loc

 `/usr/local/cuda/<ARCH>/targets/aarch64-linux/lib`
 and
-`/usr/local/cuda/<ARCH>/include`
+`/usr/local/cuda-12.8/<ARCH>/include`

 An example build might look like this:

@ -139,168 +128,6 @@ Note that in the current branch sample cross-compilation for QNX is not fully va
 near future with QNX cross-compilation instructions. In the meantime, if you want to cross-compile for QNX please check out one
 of the previous tags prior to the CMake build system transition in 12.8.

-## Running All Samples as Tests
-
-It's important to note that the CUDA samples are _not_ intended as a validation suite for CUDA. They do not cover corner cases, they do not completely cover the
-runtime and driver APIs, are not intended for performance benchmarking, etc. That said, it can sometimes be useful to run all of the samples as a quick sanity check and
-we provide a script to do so, `run_tests.py`.
-
-This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. It accepts
-the following command line arguments:
-
-| Switch     | Purpose                                                                                                        | Example                 |
-| ---------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- |
-| --dir      | Specify the root directory to search for executables (recursively)                                             | --dir ./build/Samples   |
-| --config   | JSON configuration file for executable arguments                                                               | --config test_args.json |
-| --output   | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test         |
-| --args     | Global arguments to pass to all executables (not currently used)                                               | --args arg_1 arg_2 ...  |
-| --parallel | Number of applications to execute in parallel.                                                                 | --parallel 8            |
-
-
-Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows).
-
-The script returns 0 on success, or the first non-zero error code encountered during testing on failure. It will also print a condensed list of samples that failed, if any.
-
-There are three primary modes of configuration:
-
-**Skip**
-
-An executable configured with "skip" will not be executed. These generally rely on having attached graphical displays and are not suited to this kind of automation.
-
-Configuration example:
-```json
-"fluidsGL": {
-    "skip": true
-}
-```
-
-You will see:
-```
-Skipping fluidsGL (marked as skip in config)
-```
-
-**Single Run**
-
-For executables to run one time only with arguments, specify each argument as a list entry. Each entry in the JSON file will be appended to the command line, separated
-by a space.
-
-All applications execute from their current directory, so all paths are relative to the application's location.
-
-Note that if an application needs no arguments, this entry is optional. An executable found without a matching entry in the JSON will just run as `./application` from its
-current directory.
-
-Configuration example:
-```json
-"ptxgen": {
-    "args": [
-        "test.ll",
-        "-arch=compute_75"
-    ]
-}
-```
-
-You will see:
-```
-Running ptxgen
-    Command: ./ptxgen test.ll -arch=compute_75
-    Test completed with return code 0
-```
-
-**Multiple Runs**
-
-For executables to run multiple times with different command line arguments, specify any number of sets of args within a "runs" list.
-
-As with single runs, all applications execute from their current directory, so all paths are relative to the application's location.
-
-Configuration example:
-```json
-"recursiveGaussian": {
-    "runs": [
-        {
-            "args": [
-                "-sigma=10",
-                "-file=data/ref_10.ppm"
-            ]
-        },
-        {
-            "args": [
-                "-sigma=14",
-                "-file=data/ref_14.ppm"
-            ]
-        },
-        {
-            "args": [
-                "-sigma=18",
-                "-file=data/ref_18.ppm"
-            ]
-        },
-        {
-            "args": [
-                "-sigma=22",
-                "-file=data/ref_22.ppm"
-            ]
-        }
-    ]
-}
-```
-
-You will see:
-```
-Running recursiveGaussian (run 1/4)
-    Command: ./recursiveGaussian -sigma=10 -file=data/ref_10.ppm
-    Test completed with return code 0
-Running recursiveGaussian (run 2/4)
-    Command: ./recursiveGaussian -sigma=14 -file=data/ref_14.ppm
-    Test completed with return code 0
-Running recursiveGaussian (run 3/4)
-    Command: ./recursiveGaussian -sigma=18 -file=data/ref_18.ppm
-    Test completed with return code 0
-Running recursiveGaussian (run 4/4)
-    Command: ./recursiveGaussian -sigma=22 -file=data/ref_22.ppm
-    Test completed with return code 0
-```
-
-### Example Usage
-
-Here is an example set of commands to build and test all of the samples.
-
-First, build:
-```bash
-mkdir build
-cd build
-cmake ..
-make -j$(nproc)
-```
-
-Now, return to the samples root directory and run the test script:
-```bash
-cd ..
-python3 run_tests.py --output ./test --dir ./build/Samples --config test_args.json
-```
-
-If all applications run successfully, you will see something similar to this (the specific number of samples will depend on your build type
-and system configuration):
-
-```
-Test Summary:
-Ran 199 test runs for 180 executables.
-All test runs passed!
-```
-
-If some samples fail, you will see something like this:
-
-```
-Test Summary:
-Ran 199 test runs for 180 executables.
-Failed runs (2):
-  bicubicTexture (run 1/5): Failed (code 1)
-  Mandelbrot (run 1/2): Failed (code 1)
-```
-
-You can inspect the stdout logs in the output directory (generally `APM_<application_name>.txt` or `APM_<application_name>.run<n>.txt`) to help
-determine what may have gone wrong from the output logs. Please file issues against the samples repository if you believe a sample is failing
-incorrectly on your system.
-
 ## Samples list

 ### [0. Introduction](./Samples/0_Introduction/README.md)
@ -343,7 +170,7 @@ These third-party dependencies are required by some CUDA samples. If available,

 FreeImage is an open source imaging library. FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the FreeImage website.

-To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `./Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ Debug/ execution folder or pass the FreeImage folder when cmake configuring with the `-DFreeImage_INCLUDE_DIR` and `-DFreeImage_LIBRARY` options.
+To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `../../../Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ Debug/ execution folder or pass the FreeImage folder when cmake configuring with the `-DFREEIMAGE_INCLUDE_DIR` and `-DFREEIMAGE_LIBRARY` options.

 #### Message Passing Interface

@ -376,11 +203,11 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
 #### GLFW
 GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.

-To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header folder as `-DGLFW_INCLUDE_DIR` and lib folder as `-DGLFW_LIB_DIR` for cmake configuring.
+To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html) and extract the zip file into a folder. Pass the GLFW include header folder as `-DGLFW_INCLUDE_DIR` when configuring with CMake, and follow the Build_instructions.txt in the sample folder to complete the setup.

 #### OpenMP

-OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). For compilers such as clang, `libomp.so` and other components for LLVM must be installed separated. You will also need to set additional flags in your CMake configuration files, such as: `-DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY="/path/to/libomp.so"`.
+OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).

 #### Screen

--- a/Samples/0_Introduction/CMakeLists.txt
+++ b/Samples/0_Introduction/CMakeLists.txt
@ -1,3 +1,20 @@
+cmake_minimum_required(VERSION 3.20)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
+
+project(simpleCallback LANGUAGES C CXX CUDA)
+
+find_package(CUDAToolkit REQUIRED)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
+endif()
+
+
 add_subdirectory(UnifiedMemoryStreams)
 add_subdirectory(asyncAPI)
 add_subdirectory(clock)
@ -38,7 +55,6 @@ add_subdirectory(simpleTexture3D)
 add_subdirectory(simpleTextureDrv)
 add_subdirectory(simpleVoteIntrinsics)
 add_subdirectory(simpleZeroCopy)
-add_subdirectory(template)
 add_subdirectory(systemWideAtomics)
 add_subdirectory(vectorAdd)
 add_subdirectory(vectorAddDrv)
--- a/Samples/0_Introduction/UnifiedMemoryStreams/CMakeLists.txt
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/CMakeLists.txt
@ -10,21 +10,15 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
 include_directories(../../../Common)

 # Source file
-if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    find_package(OpenMP REQUIRED C CXX)
-else()
 find_package(OpenMP REQUIRED)
-endif()

 if(${OpenMP_FOUND})
    # Add target for UnifiedMemoryStreams
--- a/Samples/0_Introduction/UnifiedMemoryStreams/README.md
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/README.md
@ -28,7 +28,7 @@ cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSe

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu
+++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu
@ -31,10 +31,10 @@
 */

 // system includes
-#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
+#include <algorithm>
 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else
@ -58,25 +58,15 @@ double drand48() { return double(rand()) / RAND_MAX; }
 const char *sSDKname = "UnifiedMemoryStreams";

 // simple task
-template <typename T> struct Task
-{
+template <typename T>
+struct Task {
  unsigned int size, id;
  T *data;
  T *result;
  T *vector;

-    Task()
-        : size(0)
-        , id(0)
-        , data(NULL)
-        , result(NULL)
-        , vector(NULL) {};
-    Task(unsigned int s)
-        : size(s)
-        , id(0)
-        , data(NULL)
-        , result(NULL)
-    {
+  Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
+  Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
    // allocate unified memory -- the operation performed in this example will
    // be a DGEMV
    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
@ -85,8 +75,7 @@ template <typename T> struct Task
    checkCudaErrors(cudaDeviceSynchronize());
  }

-    ~Task()
-    {
+  ~Task() {
    // ensure all memory is deallocated
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaFree(data));
@ -94,8 +83,7 @@ template <typename T> struct Task
    checkCudaErrors(cudaFree(vector));
  }

-    void allocate(const unsigned int s, const unsigned int unique_id)
-    {
+  void allocate(const unsigned int s, const unsigned int unique_id) {
    // allocate unified memory outside of constructor
    id = unique_id;
    size = s;
@ -117,8 +105,7 @@ template <typename T> struct Task
 };

 #ifdef USE_PTHREADS
-struct threadData_t
-{
+struct threadData_t {
  int tid;
  Task<double> *TaskListPtr;
  cudaStream_t *streams;
@ -130,8 +117,8 @@ typedef struct threadData_t threadData;
 #endif

 // simple host dgemv: assume data is in row-major format and square
-template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
-{
+template <typename T>
+void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
  // rows
  for (int i = 0; i < n; i++) {
    result[i] *= beta;
@ -144,8 +131,7 @@ template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *re

 // execute a single task on either host or device depending on size
 #ifdef USE_PTHREADS
-void *execute(void *inpArgs)
-{
+void *execute(void *inpArgs) {
  threadData *dataPtr = (threadData *)inpArgs;
  cudaStream_t *stream = dataPtr->streams;
  cublasHandle_t *handle = dataPtr->handles;
@ -156,75 +142,92 @@ void *execute(void *inpArgs)

    if (t.size < 100) {
      // perform on host
-            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+      printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
+             t.size);

      // attach managed memory to a (dummy) stream to allow host access while
      // the device is running
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+      checkCudaErrors(
+          cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+      checkCudaErrors(
+          cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+      checkCudaErrors(
+          cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
      // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
      checkCudaErrors(cudaStreamSynchronize(stream[0]));
      // call the host operation
      gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-        }
-        else {
+    } else {
      // perform on device
-            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
+      printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
+             t.size);
      double one = 1.0;
      double zero = 0.0;

      // attach managed memory to my stream
      checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
-            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
+      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
+                                               cudaMemAttachSingle));
+      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
+                                               cudaMemAttachSingle));
+      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
+                                               cudaMemAttachSingle));
      // call the device operation
-            checkCudaErrors(cublasDgemv(
-                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
+      checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
+                                  &one, t.data, t.size, t.vector, 1, &zero,
+                                  t.result, 1));
    }
  }

  pthread_exit(NULL);
 }
 #else
-template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
-{
+template <typename T>
+void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
+             int tid) {
  if (t.size < 100) {
    // perform on host
-        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+    printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
+           t.size);

    // attach managed memory to a (dummy) stream to allow host access while the
    // device is running
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+    checkCudaErrors(
+        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+    checkCudaErrors(
+        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+    checkCudaErrors(
+        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
    // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
    checkCudaErrors(cudaStreamSynchronize(stream[0]));
    // call the host operation
    gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    }
-    else {
+  } else {
    // perform on device
-        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
+    printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
+           t.size);
    double one = 1.0;
    double zero = 0.0;

    // attach managed memory to my stream
    checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
+    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
+                                             cudaMemAttachSingle));
+    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
+                                             cudaMemAttachSingle));
+    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
+                                             cudaMemAttachSingle));
    // call the device operation
-        checkCudaErrors(cublasDgemv(
-            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
+    checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
+                                &one, t.data, t.size, t.vector, 1, &zero,
+                                t.result, 1));
  }
 }
 #endif

 // populate a list of tasks with random sizes
-template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
-{
+template <typename T>
+void initialise_tasks(std::vector<Task<T> > &TaskList) {
  for (unsigned int i = 0; i < TaskList.size(); i++) {
    // generate random size
    int size;
@ -233,8 +236,7 @@ template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
  }
 }

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  // set device
  cudaDeviceProp device_prop;
  int dev_id = findCudaDevice(argc, (const char **)argv);
@ -292,17 +294,19 @@ int main(int argc, char **argv)

    if ((TaskList.size() / nthreads) == 0) {
      InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
-        }
-        else {
-            if (i == nthreads - 1) {
-                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
      InputToThreads[i].TaskListPtr =
-                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
-            }
-            else {
+          &TaskList[i * (TaskList.size() / nthreads)];
+    } else {
+      if (i == nthreads - 1) {
+        InputToThreads[i].taskSize =
+            (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+        InputToThreads[i].TaskListPtr =
+            &TaskList[i * (TaskList.size() / nthreads) +
+                      (TaskList.size() % nthreads)];
+      } else {
        InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+        InputToThreads[i].TaskListPtr =
+            &TaskList[i * (TaskList.size() / nthreads)];
      }
    }

--- a/Samples/0_Introduction/asyncAPI/CMakeLists.txt
+++ b/Samples/0_Introduction/asyncAPI/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/asyncAPI/README.md
+++ b/Samples/0_Introduction/asyncAPI/README.md
@ -27,6 +27,6 @@ cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaPro

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/asyncAPI/asyncAPI.cu
+++ b/Samples/0_Introduction/asyncAPI/asyncAPI.cu
@ -38,21 +38,19 @@
 #include <stdio.h>

 // includes CUDA Runtime
-#include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
+#include <cuda_profiler_api.h>

 // includes, project
 #include <helper_cuda.h>
 #include <helper_functions.h>  // helper utility functions

-__global__ void increment_kernel(int *g_data, int inc_value)
-{
+__global__ void increment_kernel(int *g_data, int inc_value) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  g_data[idx] = g_data[idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x)
-{
+bool correct_output(int *data, const int n, const int x) {
  for (int i = 0; i < n; i++)
    if (data[i] != x) {
      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
@ -62,8 +60,7 @@ bool correct_output(int *data, const int n, const int x)
  return true;
 }

-int main(int argc, char *argv[])
-{
+int main(int argc, char *argv[]) {
  int devID;
  cudaDeviceProp deviceProps;

@ -129,7 +126,8 @@ int main(int argc, char *argv[])
  // print the cpu and gpu times
  printf("time spent executing by the GPU: %.2f\n", gpu_time);
  printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
+  printf("CPU executed %lu iterations while waiting for GPU to finish\n",
+         counter);

  // check the output for correctness
  bool bFinalResults = correct_output(a, n, value);
--- a/Samples/0_Introduction/clock/CMakeLists.txt
+++ b/Samples/0_Introduction/clock/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/clock/README.md
+++ b/Samples/0_Introduction/clock/README.md
@ -27,6 +27,6 @@ cudaMalloc, cudaMemcpy, cudaFree

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/clock/clock.cu
+++ b/Samples/0_Introduction/clock/clock.cu
@ -48,16 +48,15 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
-{
+__global__ static void timedReduction(const float *input, float *output,
+                                      clock_t *timer) {
  // __shared__ float shared[2 * blockDim.x];
  extern __shared__ float shared[];

  const int tid = threadIdx.x;
  const int bid = blockIdx.x;

-    if (tid == 0)
-        timer[bid] = clock();
+  if (tid == 0) timer[bid] = clock();

  // Copy input.
  shared[tid] = input[tid];
@ -78,13 +77,11 @@ __global__ static void timedReduction(const float *input, float *output, clock_t
  }

  // Write result.
-    if (tid == 0)
-        output[bid] = shared[0];
+  if (tid == 0) output[bid] = shared[0];

  __syncthreads();

-    if (tid == 0)
-        timer[bid + gridDim.x] = clock();
+  if (tid == 0) timer[bid + gridDim.x] = clock();
 }

 #define NUM_BLOCKS 64
@ -107,8 +104,7 @@ __global__ static void timedReduction(const float *input, float *output, clock_t
 // the memory. With more than 32 the speed scales linearly.

 // Start the main CUDA Sample here
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("CUDA Clock sample\n");

  // This will pick the best possible CUDA capable device
@ -125,15 +121,20 @@ int main(int argc, char **argv)
    input[i] = (float)i;
  }

-    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+  checkCudaErrors(
+      cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
  checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+  checkCudaErrors(
+      cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
+                             cudaMemcpyHostToDevice));

-    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
+  timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
+      dinput, doutput, dtimer);

-    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
+                             cudaMemcpyDeviceToHost));

  checkCudaErrors(cudaFree(dinput));
  checkCudaErrors(cudaFree(doutput));
--- a/Samples/0_Introduction/clock_nvrtc/CMakeLists.txt
+++ b/Samples/0_Introduction/clock_nvrtc/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/clock_nvrtc/README.md
+++ b/Samples/0_Introduction/clock_nvrtc/README.md
@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/clock_nvrtc/clock.cpp
+++ b/Samples/0_Introduction/clock_nvrtc/clock.cpp
@ -34,11 +34,12 @@
 */

 // System includes
+#include <stdio.h>
+#include <stdint.h>
 #include <assert.h>
+
 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
-#include <stdint.h>
-#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@ -70,8 +71,7 @@

 // Start the main CUDA Sample here

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("CUDA Clock sample\n");

  typedef long clock_t;
@ -106,20 +106,17 @@ int main(int argc, char **argv)

  void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-    checkCudaErrors(cuLaunchKernel(kernel_addr,
-                                   cudaGridSize.x,
-                                   cudaGridSize.y,
+  checkCudaErrors(cuLaunchKernel(
+      kernel_addr, cudaGridSize.x, cudaGridSize.y,
      cudaGridSize.z,                                    /* grid dim */
-                                   cudaBlockSize.x,
-                                   cudaBlockSize.y,
-                                   cudaBlockSize.z, /* block dim */
-                                   sizeof(float) * 2 * NUM_THREADS,
-                                   0,       /* shared mem, stream */
+      cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
+      sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
      &arr[0],                            /* arguments */
      0));

  checkCudaErrors(cuCtxSynchronize());
-    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+  checkCudaErrors(
+      cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
  checkCudaErrors(cuMemFree(dinput));
  checkCudaErrors(cuMemFree(doutput));
  checkCudaErrors(cuMemFree(dtimer));
--- a/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu
+++ b/Samples/0_Introduction/clock_nvrtc/clock_kernel.cu
@ -37,16 +37,15 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
-{
+extern "C" __global__ void timedReduction(const float *input, float *output,
+                                          clock_t *timer) {
  // __shared__ float shared[2 * blockDim.x];
  extern __shared__ float shared[];

  const int tid = threadIdx.x;
  const int bid = blockIdx.x;

-    if (tid == 0)
-        timer[bid] = clock();
+  if (tid == 0) timer[bid] = clock();

  // Copy input.
  shared[tid] = input[tid];
@ -67,11 +66,9 @@ extern "C" __global__ void timedReduction(const float *input, float *output, clo
  }

  // Write result.
-    if (tid == 0)
-        output[bid] = shared[0];
+  if (tid == 0) output[bid] = shared[0];

  __syncthreads();

-    if (tid == 0)
-        timer[bid + gridDim.x] = clock();
+  if (tid == 0) timer[bid + gridDim.x] = clock();
 }
--- a/Samples/0_Introduction/cudaOpenMP/CMakeLists.txt
+++ b/Samples/0_Introduction/cudaOpenMP/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/cudaOpenMP/README.md
+++ b/Samples/0_Introduction/cudaOpenMP/README.md
@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaGetLastError, cudaSetDevice, cudaG

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/cudaOpenMP/cudaOpenMP.cu
+++ b/Samples/0_Introduction/cudaOpenMP/cudaOpenMP.cu
@ -37,24 +37,20 @@
 using namespace std;

 // a simple kernel that simply increments each array element by b
-__global__ void kernelAddConstant(int *g_a, const int b)
-{
+__global__ void kernelAddConstant(int *g_a, const int b) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  g_a[idx] += b;
 }

 // a predicate that checks whether each array element is set to its index plus b
-int correctResult(int *data, const int n, const int b)
-{
+int correctResult(int *data, const int n, const int b) {
  for (int i = 0; i < n; i++)
-        if (data[i] != i + b)
-            return 0;
+    if (data[i] != i + b) return 0;

  return 1;
 }

-int main(int argc, char *argv[])
-{
+int main(int argc, char *argv[]) {
  int num_gpus = 0;  // number of CUDA GPUs

  printf("%s Starting...\n\n", argv[0]);
@ -97,8 +93,7 @@ int main(int argc, char *argv[])
    return 1;
  }

-    for (unsigned int i = 0; i < n; i++)
-        a[i] = i;
+  for (unsigned int i = 0; i < n; i++) a[i] = i;

  ////////////////////////////////////////////////////////////////
  // run as many CPU threads as there are CUDA devices
@ -110,7 +105,8 @@ int main(int argc, char *argv[])
  //   Recall that all variables declared inside an "omp parallel" scope are
  //   local to each CPU thread
  //
-    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
+  omp_set_num_threads(
+      num_gpus);  // create as many CPU threads as there are CUDA devices
 // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
 // are CUDA devices
 #pragma omp parallel
@ -120,23 +116,31 @@ int main(int argc, char *argv[])

    // set and check the CUDA device for this CPU thread
    int gpu_id = -1;
-        checkCudaErrors(
-            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
+    checkCudaErrors(cudaSetDevice(
+        cpu_thread_id %
+        num_gpus));  // "% num_gpus" allows more CPU threads than GPU devices
    checkCudaErrors(cudaGetDevice(&gpu_id));
-        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
+    printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
+           num_cpu_threads, gpu_id);

-        int         *d_a   = 0; // pointer to memory on the device associated with this CPU thread
-        int         *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
+    int *d_a =
+        0;  // pointer to memory on the device associated with this CPU thread
+    int *sub_a =
+        a +
+        cpu_thread_id * n /
+            num_cpu_threads;  // pointer to this CPU thread's portion of data
    unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
    dim3 gpu_threads(128);  // 128 threads per block
    dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
    checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
-        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
+    checkCudaErrors(
+        cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
    kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

-        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
+    checkCudaErrors(
+        cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_a));
  }
  printf("---------------------------\n");
@ -149,8 +153,7 @@ int main(int argc, char *argv[])
  //
  bool bResult = correctResult(a, n, b);

-    if (a)
-        free(a); // free CPU memory
+  if (a) free(a);  // free CPU memory

  exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
--- a/Samples/0_Introduction/fp16ScalarProduct/CMakeLists.txt
+++ b/Samples/0_Introduction/fp16ScalarProduct/CMakeLists.txt
@ -9,10 +9,8 @@ find_package(CUDAToolkit REQUIRED)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 60 61 70 72 75 80 86 87 89 90 100 101 120)
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/fp16ScalarProduct/README.md
+++ b/Samples/0_Introduction/fp16ScalarProduct/README.md
@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaMallocHost, cudaFreeHost, cudaMalloc, cudaGetDevicePro

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/fp16ScalarProduct/fp16ScalarProduct.cu
+++ b/Samples/0_Introduction/fp16ScalarProduct/fp16ScalarProduct.cu
@ -25,18 +25,17 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+#include "cuda_fp16.h"
+#include "helper_cuda.h"
+
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>

-#include "cuda_fp16.h"
-#include "helper_cuda.h"
-
 #define NUM_OF_BLOCKS 128
 #define NUM_OF_THREADS 128

-__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
-{
+__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
  if (threadIdx.x < 64)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
  __syncthreads();
@ -60,34 +59,27 @@ __forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
  __syncthreads();
 }

-__forceinline__ __device__ void reduceInShared_native(half2 *const v)
-{
-    if (threadIdx.x < 64)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
+__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
+  if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
  __syncthreads();
-    if (threadIdx.x < 32)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
+  if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
  __syncthreads();
-    if (threadIdx.x < 16)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
+  if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
  __syncthreads();
-    if (threadIdx.x < 8)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
+  if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
  __syncthreads();
-    if (threadIdx.x < 4)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
+  if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
  __syncthreads();
-    if (threadIdx.x < 2)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
+  if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
  __syncthreads();
-    if (threadIdx.x < 1)
-        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
+  if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
  __syncthreads();
 }

-__global__ void
-scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
-{
+__global__ void scalarProductKernel_intrinsics(half2 const *const a,
+                                               half2 const *const b,
+                                               float *const results,
+                                               size_t const size) {
  const int stride = gridDim.x * blockDim.x;
  __shared__ half2 shArray[NUM_OF_THREADS];

@ -109,9 +101,10 @@ scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float
  }
 }

-__global__ void
-scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
-{
+__global__ void scalarProductKernel_native(half2 const *const a,
+                                           half2 const *const b,
+                                           float *const results,
+                                           size_t const size) {
  const int stride = gridDim.x * blockDim.x;
  __shared__ half2 shArray[NUM_OF_THREADS];

@ -133,8 +126,7 @@ scalarProductKernel_native(half2 const *const a, half2 const *const b, float *co
  }
 }

-void generateInput(half2 *a, size_t size)
-{
+void generateInput(half2 *a, size_t size) {
  for (size_t i = 0; i < size; ++i) {
    half2 temp;
    temp.x = static_cast<float>(rand() % 4);
@ -143,8 +135,7 @@ void generateInput(half2 *a, size_t size)
  }
 }

-int main(int argc, char *argv[])
-{
+int main(int argc, char *argv[]) {
  srand((unsigned int)time(NULL));
  size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

@ -160,7 +151,8 @@ int main(int argc, char *argv[])
  checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

  if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
-        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
+    printf(
+        "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
        "higher.\n");
    return EXIT_WAIVED;
  }
@ -170,17 +162,23 @@ int main(int argc, char *argv[])
    checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
  }

-    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
-    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
+  checkCudaErrors(
+      cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
+  checkCudaErrors(
+      cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

  for (int i = 0; i < 2; ++i) {
    generateInput(vec[i], size);
-        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
+                               cudaMemcpyHostToDevice));
  }

-    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
+  scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
+      devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(results, devResults,
+                             NUM_OF_BLOCKS * sizeof *results,
+                             cudaMemcpyDeviceToHost));

  float result_native = 0;
  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
@ -188,9 +186,12 @@ int main(int argc, char *argv[])
  }
  printf("Result native operators\t: %f \n", result_native);

-    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
+  scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
+      devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaMemcpy(results, devResults,
+                             NUM_OF_BLOCKS * sizeof *results,
+                             cudaMemcpyDeviceToHost));

  float result_intrinsics = 0;
  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
@ -198,7 +199,9 @@ int main(int argc, char *argv[])
  }
  printf("Result intrinsics\t: %f \n", result_intrinsics);

-    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");
+  printf("&&&& fp16ScalarProduct %s\n",
+         (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
+                                                             : "FAILED");

  for (int i = 0; i < 2; ++i) {
    checkCudaErrors(cudaFree(devVec[i]));
--- a/Samples/0_Introduction/matrixMul/CMakeLists.txt
+++ b/Samples/0_Introduction/matrixMul/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/matrixMul/README.md
+++ b/Samples/0_Introduction/matrixMul/README.md
@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication.  To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication.  To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance matrix multiplication.

 ## Key Concepts

@ -27,6 +27,6 @@ cudaStreamCreateWithFlags, cudaProfilerStop, cudaMalloc, cudaFree, cudaMallocHos

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/matrixMul/matrixMul.cu
+++ b/Samples/0_Introduction/matrixMul/matrixMul.cu
@ -40,23 +40,24 @@
 */

 // System includes
-#include <assert.h>
 #include <stdio.h>
+#include <assert.h>

 // CUDA runtime
-#include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
+#include <cuda_profiler_api.h>

 // Helper functions and utilities to work with CUDA
-#include <helper_cuda.h>
 #include <helper_functions.h>
+#include <helper_cuda.h>

 /**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
-template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
-{
+template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
+    float *B, int wA,
+    int wB) {
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;
@ -86,7 +87,9 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, floa

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
-    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
+  for (int a = aBegin, b = bBegin;
+       a <= aEnd;
+       a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
@ -125,8 +128,7 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, floa
  C[c + wB * ty + tx] = Csub;
 }

-void ConstantInit(float *data, int size, float val)
-{
+void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
@ -135,8 +137,9 @@ void ConstantInit(float *data, int size, float val)
 /**
 * Run a simple test of matrix multiplication using CUDA
 */
-int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
-{
+int MatrixMultiply(int argc, char **argv,
+                   int block_size, const dim3 &dimsA,
+                   const dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
@ -178,8 +181,10 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // copy host memory to device
-    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

  // Setup execution parameters
  dim3 threads(block_size, block_size);
@ -190,10 +195,11 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con

  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
-        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-    }
-    else {
-        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+    MatrixMulCUDA<16>
+        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+  } else {
+    MatrixMulCUDA<32>
+        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  }

  printf("done\n");
@ -207,10 +213,11 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
-            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-        }
-        else {
-            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+      MatrixMulCUDA<16>
+          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+    } else {
+      MatrixMulCUDA<32>
+          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
  }

@ -225,18 +232,19 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
-    double flopsPerMatrixMul =
-        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
-    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
-    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
+  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
+                             static_cast<double>(dimsA.y) *
+                             static_cast<double>(dimsB.x);
+  double gigaFlops =
+      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+  printf(
+      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
-           gigaFlops,
-           msecPerMatrixMul,
-           flopsPerMatrixMul,
-           threads.x * threads.y);
+      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

  // Copy result from device to host
-    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  printf("Checking computed result for correctness: ");
@ -253,7 +261,8 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con
    double rel_err = abs_err / abs_val / dot_length;

    if (rel_err > eps) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
+      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
+             i, h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }
@ -269,13 +278,13 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
-    printf("\nNOTE: The CUDA Samples are not meant for performance "
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance "
      "measurements. Results may vary when GPU Boost is enabled.\n");

  if (correct) {
    return EXIT_SUCCESS;
-    }
-    else {
+  } else {
    return EXIT_FAILURE;
  }
 }
@ -284,15 +293,15 @@ int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, con
 /**
 * Program main
 */
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("[Matrix Multiply Using CUDA] - Starting...\n");

-    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
+  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
+      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf("  Note: Outer matrix dimensions of A & B matrices"
+    printf("  Note: Outer matrix dimensions of A & B matrices" \
           " must be equal.\n");

    exit(EXIT_SUCCESS);
@ -328,11 +337,13 @@ int main(int argc, char **argv)
  }

  if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
+    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
+           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
+  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
+         dimsB.x, dimsB.y);

  checkCudaErrors(cudaProfilerStart());
  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
--- a/Samples/0_Introduction/matrixMulDrv/CMakeLists.txt
+++ b/Samples/0_Introduction/matrixMulDrv/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
@ -40,12 +38,6 @@ target_link_libraries(matrixMulDrv PUBLIC
 set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/matrixMul_kernel64.fatbin")
 set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/matrixMul_kernel.cu")

-# Construct GENCODE_FLAGS explicitly from CUDA architectures
-set(GENCODE_FLAGS "")
-foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
-    list(APPEND GENCODE_FLAGS "-gencode=arch=compute_${arch},code=sm_${arch}")
-endforeach()
-
 add_custom_command(
    OUTPUT ${CUDA_FATBIN_FILE}
    COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets  ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}
--- a/Samples/0_Introduction/matrixMulDrv/README.md
+++ b/Samples/0_Introduction/matrixMulDrv/README.md
@ -27,6 +27,6 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuDeviceGetName, cuDeviceTotalMem, c

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp
+++ b/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp
@ -46,23 +46,23 @@

 // includes, system
 #include <builtin_types.h>
-#include <cstring>
-#include <iostream>
 #include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
+#include <iostream>
+#include <cstring>

 // includes, project, CUDA
-#include <cstring>
 #include <cuda.h>
 #include <helper_cuda_drvapi.h>
 #include <helper_image.h>
 #include <helper_string.h>
 #include <helper_timer.h>
+
+#include <cstring>
 #include <iostream>
 #include <string>
-
 #include "matrixMul.h"


@ -71,9 +71,11 @@
 void runTest(int argc, char **argv);
 void randomInit(float *, int);

-extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C" void computeGold(float *, const float *, const float *, unsigned int,
+                            unsigned int, unsigned int);

-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
+                    int *blk_size);

 #ifndef FATBIN_FILE
 #define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -89,8 +91,7 @@ size_t    totalGlobalMem;

 const char *sSDKsample = "matrixMulDrv (Driver API)";

-void constantInit(float *data, int size, float val)
-{
+void constantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
@ -99,8 +100,7 @@ void constantInit(float *data, int size, float val)
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("[ %s ]\n", sSDKsample);

  runTest(argc, argv);
@ -109,8 +109,7 @@ int main(int argc, char **argv)
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv)
-{
+void runTest(int argc, char **argv) {
  // initialize CUDA
  CUfunction matrixMul = NULL;
  int block_size = 0;
@ -173,19 +172,10 @@ void runTest(int argc, char **argv)
    size_t Matrix_Width_B = (size_t)WB;
    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
    // new CUDA 4.0 Driver API Kernel launch call
-        checkCudaErrors(cuLaunchKernel(matrixMul,
-                                       grid.x,
-                                       grid.y,
-                                       grid.z,
-                                       block.x,
-                                       block.y,
-                                       block.z,
-                                       2 * block_size * block_size * sizeof(float),
-                                       NULL,
-                                       args,
-                                       NULL));
-    }
-    else {
+    checkCudaErrors(cuLaunchKernel(
+        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
+        2 * block_size * block_size * sizeof(float), NULL, args, NULL));
+  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
@ -208,20 +198,14 @@ void runTest(int argc, char **argv)
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
    offset += sizeof(Matrix_Width_B);

-        void *kernel_launch_config[5] = {
-            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
+    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
+                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
+                                     CU_LAUNCH_PARAM_END};

    // new CUDA 4.0 Driver API Kernel launch call
-        checkCudaErrors(cuLaunchKernel(matrixMul,
-                                       grid.x,
-                                       grid.y,
-                                       grid.z,
-                                       block.x,
-                                       block.y,
-                                       block.z,
-                                       2 * block_size * block_size * sizeof(float),
-                                       NULL,
-                                       NULL,
+    checkCudaErrors(cuLaunchKernel(
+        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
+        2 * block_size * block_size * sizeof(float), NULL, NULL,
        reinterpret_cast<void **>(&kernel_launch_config)));
  }

@ -238,7 +222,8 @@ void runTest(int argc, char **argv)

  for (int i = 0; i < static_cast<int>(WC * HC); i++) {
    if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
+      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
+             h_C[i], WA * valB);
      correct = false;
    }
  }
@ -259,15 +244,14 @@ void runTest(int argc, char **argv)
 }

 // Allocates a matrix with random float entries.
-void randomInit(float *data, int size)
-{
+void randomInit(float *data, int size) {
  for (int i = 0; i < size; ++i) {
    data[i] = rand() / static_cast<float>(RAND_MAX);
  }
 }

-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
-{
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
+                    int *blk_size) {
  CUfunction cuFunction = 0;
  int major = 0, minor = 0;
  char deviceName[100];
@ -275,13 +259,16 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size
  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
-    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
-    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);
+  printf("  Total amount of global memory:     %llu bytes\n",
+         (long long unsigned int)totalGlobalMem);

  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

@ -291,8 +278,7 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
-    }
-    else {
+  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

@ -305,7 +291,8 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size
  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  // select the suitable kernel function
-    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};
+  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
+                           "matrixMul_bs8_64bit"};

  int idx = 0;
  int block_size = 32;
@ -315,12 +302,12 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size

    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
-            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
+        &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
+        2 * block_size * block_size * sizeof(float), 0));
    if (block_size * block_size <= threadsPerBlock) {
      printf("> %d block size selected\n", block_size);
      break;
-        }
-        else {
+    } else {
      block_size /= 2;
    }
    idx++;
--- a/Samples/0_Introduction/matrixMulDrv/matrixMul_kernel.cu
+++ b/Samples/0_Introduction/matrixMulDrv/matrixMul_kernel.cu
@ -42,8 +42,8 @@
 //! wA is A's width and wB is B's width
 ////////////////////////////////////////////////////////////////////////////////
 template <int block_size, typename size_type>
-__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
-{
+__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
+                          size_type wB) {
  // Block index
  size_type bx = blockIdx.x;
  size_type by = blockIdx.y;
@ -96,8 +96,7 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type
    // of the block sub-matrix
 #pragma unroll

-        for (size_type k = 0; k < block_size; ++k)
-            Csub += AS(ty, k) * BS(k, tx);
+    for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
@ -112,16 +111,16 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type
 }

 // C wrappers around our template kernel
-extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
-{
+extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
+                                               size_t wA, size_t wB) {
  matrixMul<8, size_t>(C, A, B, wA, wB);
 }
-extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
-{
+extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
+                                                size_t wA, size_t wB) {
  matrixMul<16, size_t>(C, A, B, wA, wB);
 }
-extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
-{
+extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
+                                                size_t wA, size_t wB) {
  matrixMul<32, size_t>(C, A, B, wA, wB);
 }

--- a/Samples/0_Introduction/matrixMulDynlinkJIT/CMakeLists.txt
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/README.md
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/README.md
@ -27,6 +27,6 @@ cuMemcpyDtoH, cuDeviceGetName, cuParamSeti, cuModuleLoadDataEx, cuModuleGetFunct

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink.c
@ -20,9 +20,8 @@
 //#define CUDA_INIT_D3D11
 //#define CUDA_INIT_OPENGL

-#include "cuda_drvapi_dynlink.h"
-
 #include <stdio.h>
+#include "cuda_drvapi_dynlink.h"

 tcuInit                               *_cuInit;
 tcuDriverGetVersion                   *cuDriverGetVersion;
@ -240,7 +239,8 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
    *pInstance = LoadLibrary(__CudaLibName);

-    if (*pInstance == NULL) {
+    if (*pInstance == NULL)
+    {
        printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@ -251,21 +251,24 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 #define GET_PROC_EX(name, alias, required)                     \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);               \
    if (alias == NULL && required) {                                    \
-        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               #name, __CudaLibName);                                  \
        return CUDA_ERROR_UNKNOWN;                                      \
    }

 #define GET_PROC_EX_V2(name, alias, required)                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
    if (alias == NULL && required) {                                    \
-        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               STRINGIFY(name##_v2), __CudaLibName);                       \
        return CUDA_ERROR_UNKNOWN;                                      \
    }

 #define GET_PROC_EX_V3(name, alias, required)                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
    if (alias == NULL && required) {                                    \
-        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               STRINGIFY(name##_v3), __CudaLibName);                       \
        return CUDA_ERROR_UNKNOWN;                                      \
    }

@ -291,7 +294,8 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
    *pInstance = dlopen(__CudaLibName, RTLD_NOW);

-    if (*pInstance == NULL) {
+    if (*pInstance == NULL)
+    {
        printf("dlopen \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@ -302,21 +306,24 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 #define GET_PROC_EX(name, alias, required)                              \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                        \
    if (alias == NULL && required) {                                    \
-        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               #name, __CudaLibName);                                  \
        return CUDA_ERROR_UNKNOWN;                                      \
    }

 #define GET_PROC_EX_V2(name, alias, required)                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));         \
    if (alias == NULL && required) {                                    \
-        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               STRINGIFY(name##_v2), __CudaLibName);                    \
        return CUDA_ERROR_UNKNOWN;                                      \
    }

 #define GET_PROC_EX_V3(name, alias, required)                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));         \
    if (alias == NULL && required) {                                    \
-        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               STRINGIFY(name##_v3), __CudaLibName);                    \
        return CUDA_ERROR_UNKNOWN;                                      \
    }

@ -352,7 +359,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    // available since 2.2. if not present, version 1.0 is assumed
    GET_PROC_OPTIONAL(cuDriverGetVersion);

-    if (cuDriverGetVersion) {
+    if (cuDriverGetVersion)
+    {
        CHECKED_CALL(cuDriverGetVersion(&driverVer));
    }

@ -420,21 +428,24 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    GET_PROC(cuStreamDestroy);

    // These are CUDA 5.0 new functions
-    if (driverVer >= 5000) {
+    if (driverVer >= 5000)
+    {
        GET_PROC(cuMipmappedArrayCreate);
        GET_PROC(cuMipmappedArrayDestroy);
        GET_PROC(cuMipmappedArrayGetLevel);
    }

    // These are CUDA 4.2 new functions
-    if (driverVer >= 4020) {
+    if (driverVer >= 4020)
+    {
        GET_PROC(cuFuncSetSharedMemConfig);
        GET_PROC(cuCtxGetSharedMemConfig);
        GET_PROC(cuCtxSetSharedMemConfig);
    }

    // These are CUDA 4.1 new functions
-    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
+    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
+    {
        GET_PROC(cuDeviceGetByPCIBusId);
        GET_PROC(cuDeviceGetPCIBusId);
        GET_PROC(cuIpcGetEventHandle);
@ -445,7 +456,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    }

    // These could be _v2 interfaces
-    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
+    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
+    {
        GET_PROC_V2(cuCtxDestroy);
        GET_PROC_V2(cuCtxPopCurrent);
        GET_PROC_V2(cuCtxPushCurrent);
@ -453,7 +465,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC_V2(cuEventDestroy);
    }

-    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
+    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
+    {
        GET_PROC_V2(cuDeviceTotalMem);
        GET_PROC_V2(cuCtxCreate);
        GET_PROC_V2(cuModuleGetGlobal);
@ -494,14 +507,17 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC_V2(cuTexRefSetAddress);
        GET_PROC_V2(cuTexRefGetAddress);

-        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
+        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
+        {
            GET_PROC_V3(cuTexRefSetAddress2D);
        }
-        else {
+        else
+        {
            GET_PROC_V2(cuTexRefSetAddress2D);
        }
    }
-    else {
+    else
+    {
        // versions earlier than 3020
        GET_PROC(cuDeviceTotalMem);
        GET_PROC(cuCtxCreate);
@ -546,7 +562,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    }

    // The following functions are specific to CUDA versions
-    if (driverVer >= 4000) {
+    if (driverVer >= 4000)
+    {
        GET_PROC(cuCtxSetCurrent);
        GET_PROC(cuCtxGetCurrent);
        GET_PROC(cuMemHostRegister);
@ -557,7 +574,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuProfilerStop);
    }

-    if (driverVer >= 3010) {
+    if (driverVer >= 3010)
+    {
        GET_PROC(cuModuleGetSurfRef);
        GET_PROC(cuSurfRefSetArray);
        GET_PROC(cuSurfRefGetArray);
@ -565,7 +583,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuCtxGetLimit);
    }

-    if (driverVer >= 3000) {
+    if (driverVer >= 3000)
+    {
        GET_PROC(cuMemcpyDtoDAsync);
        GET_PROC(cuFuncSetCacheConfig);
 #ifdef CUDA_INIT_D3D11
@ -576,10 +595,12 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuGraphicsUnregisterResource);
        GET_PROC(cuGraphicsSubResourceGetMappedArray);

-        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
+        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
+        {
            GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
        }
-        else {
+        else
+        {
            GET_PROC(cuGraphicsResourceGetMappedPointer);
        }

@ -589,7 +610,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuGetExportTable);
    }

-    if (driverVer >= 2030) {
+    if (driverVer >= 2030)
+    {
        GET_PROC(cuMemHostGetFlags);
 #ifdef CUDA_INIT_D3D10
        GET_PROC(cuD3D10GetDevice);
@ -602,7 +624,8 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
 #endif
    }

-    if (driverVer >= 2010) {
+    if (driverVer >= 2010)
+    {
        GET_PROC(cuModuleLoadDataEx);
        GET_PROC(cuModuleLoadFatBinary);
 #ifdef CUDA_INIT_OPENGL
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/cuda_drvapi_dynlink_cuda.h
@ -43,8 +43,7 @@
 #define CUDA_VERSION 3020 /* 3.2 */

 #ifdef __cplusplus
-extern "C"
-{
+extern "C" {
 #endif

 /**
@ -82,7 +81,8 @@ extern "C"
 /**
 * Context creation flags
 */
-    typedef enum CUctx_flags_enum {
+typedef enum CUctx_flags_enum
+{
    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
@ -103,7 +103,8 @@ extern "C"
 /**
 * Event creation flags
 */
-    typedef enum CUevent_flags_enum {
+typedef enum CUevent_flags_enum
+{
    CU_EVENT_DEFAULT        = 0, /**< Default event flag */
    CU_EVENT_BLOCKING_SYNC  = 1, /**< Event uses blocking synchronization */
    CU_EVENT_DISABLE_TIMING = 2  /**< Event will not record timing data */
@ -112,7 +113,8 @@ extern "C"
 /**
 * Array formats
 */
-    typedef enum CUarray_format_enum {
+typedef enum CUarray_format_enum
+{
    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
@ -126,7 +128,8 @@ extern "C"
 /**
 * Texture reference addressing modes
 */
-    typedef enum CUaddress_mode_enum {
+typedef enum CUaddress_mode_enum
+{
    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
@ -136,7 +139,8 @@ extern "C"
 /**
 * Texture reference filtering modes
 */
-    typedef enum CUfilter_mode_enum {
+typedef enum CUfilter_mode_enum
+{
    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
 } CUfilter_mode;
@ -144,7 +148,8 @@ extern "C"
 /**
 * Device properties
 */
-    typedef enum CUdevice_attribute_enum {
+typedef enum CUdevice_attribute_enum
+{
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
@ -153,15 +158,12 @@ extern "C"
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
-        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK =
-            8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
-        CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY =
-            9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
-        CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK =
-            12,                                     /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Peak clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently */
@ -188,8 +190,7 @@ extern "C"
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,          /**< Major compute capability version number */
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76           /**< Minor compute capability version number */
 #if __CUDA_API_VERSION >= 4000
-        ,
-        CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE              = 36, /**< Peak memory clock frequency in kilohertz */
+                                     , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
@ -220,7 +221,8 @@ extern "C"
 /**
 * Function properties
 */
-    typedef enum CUfunction_attribute_enum {
+typedef enum CUfunction_attribute_enum
+{
    /**
     * The maximum number of threads per block, beyond which a launch of the
     * function would fail. This number depends on both the function and the
@ -275,7 +277,8 @@ extern "C"
 /**
 * Function cache configurations
 */
-    typedef enum CUfunc_cache_enum {
+typedef enum CUfunc_cache_enum
+{
    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
    CU_FUNC_CACHE_PREFER_L1      = 0x02  /**< prefer larger L1 cache and smaller shared memory */
@ -284,7 +287,8 @@ extern "C"
 /**
 * Shared memory configurations
 */
-    typedef enum CUsharedconfig_enum {
+typedef enum CUsharedconfig_enum
+{
    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< set default shared memory bank size */
    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< set shared memory bank width to four bytes */
    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< set shared memory bank width to eight bytes */
@ -293,34 +297,33 @@ extern "C"
 /**
 * Memory types
 */
-    typedef enum CUmemorytype_enum {
+typedef enum CUmemorytype_enum
+{
    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
    CU_MEMORYTYPE_ARRAY   = 0x03     /**< Array memory */
 #if __CUDA_API_VERSION >= 4000
-        ,
-        CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
+                            , CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
 #endif
 } CUmemorytype;

 /**
 * Compute Modes
 */
-    typedef enum CUcomputemode_enum {
+typedef enum CUcomputemode_enum
+{
    CU_COMPUTEMODE_DEFAULT           = 0,  /**< Default compute mode (Multiple contexts allowed per device) */
-        CU_COMPUTEMODE_PROHIBITED =
-            2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
+    CU_COMPUTEMODE_PROHIBITED        = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
 #if __CUDA_API_VERSION >= 4000
-        ,
-        CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single
-                                                process can be present on this device at a time) */
+                                       , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
 #endif
 } CUcomputemode;

 /**
 * Online compiler options
 */
-    typedef enum CUjit_option_enum {
+typedef enum CUjit_option_enum
+{
    /**
     * Max number of registers that a thread may use.\n
     * Option type: unsigned int
@ -411,7 +414,8 @@ extern "C"
 /**
 * Online compilation targets
 */
-    typedef enum CUjit_target_enum {
+typedef enum CUjit_target_enum
+{
    CU_TARGET_COMPUTE_20 = 20,       /**< Compute device class 2.0 */
    CU_TARGET_COMPUTE_21 = 21,       /**< Compute device class 2.1 */
    CU_TARGET_COMPUTE_30 = 30,       /**< Compute device class 3.0 */
@ -430,7 +434,8 @@ extern "C"
 /**
 * Cubin matching fallback strategies
 */
-    typedef enum CUjit_fallback_enum {
+typedef enum CUjit_fallback_enum
+{
    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx */
    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code */
 } CUjit_fallback;
@ -438,7 +443,8 @@ extern "C"
 /**
 * Flags to register a graphics resource
 */
-    typedef enum CUgraphicsRegisterFlags_enum {
+typedef enum CUgraphicsRegisterFlags_enum
+{
    CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
@ -448,7 +454,8 @@ extern "C"
 /**
 * Flags for mapping and unmapping interop resources
 */
-    typedef enum CUgraphicsMapResourceFlags_enum {
+typedef enum CUgraphicsMapResourceFlags_enum
+{
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
@ -457,7 +464,8 @@ extern "C"
 /**
 * Array indices for cube faces
 */
-    typedef enum CUarray_cubemap_face_enum {
+typedef enum CUarray_cubemap_face_enum
+{
    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
@ -469,7 +477,8 @@ extern "C"
 /**
 * Limits
 */
-    typedef enum CUlimit_enum {
+typedef enum CUlimit_enum
+{
    CU_LIMIT_STACK_SIZE        = 0x00, /**< GPU thread stack size */
    CU_LIMIT_PRINTF_FIFO_SIZE  = 0x01, /**< GPU printf FIFO size */
    CU_LIMIT_MALLOC_HEAP_SIZE  = 0x02  /**< GPU malloc heap size */
@ -478,7 +487,8 @@ extern "C"
 /**
 * Resource types
 */
-    typedef enum CUresourcetype_enum {
+typedef enum CUresourcetype_enum
+{
    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resoure */
    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
@ -488,7 +498,8 @@ extern "C"
 /**
 * Error codes
 */
-    typedef enum cudaError_enum {
+typedef enum cudaError_enum
+{
    /**
     * The API call returned with no errors. In the case of query calls, this
     * can also mean that the operation being queried is complete (see
@ -1053,7 +1064,8 @@ extern "C"
 /**
 * Resource view format
 */
-    typedef enum CUresourceViewFormat_enum {
+typedef enum CUresourceViewFormat_enum
+{
    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
@ -1118,6 +1130,7 @@ extern "C"
 #endif


+
 /**
 * If set, the CUDA array is a collection of layers, where each layer is either a 1D
 * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
@ -1407,11 +1420,7 @@ typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);

 typedef CUresult  CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
 typedef CUresult  CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
-    typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule     *module,
-                                                 const void   *image,
-                                                 unsigned int  numOptions,
-                                                 CUjit_option *options,
-                                                 void        **optionValues);
+typedef CUresult  CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
 typedef CUresult  CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
 typedef CUresult  CUDAAPI tcuModuleUnload(CUmodule hmod);
 typedef CUresult  CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
@ -1440,7 +1449,8 @@ typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *byt
                                          size_t Height,
                                          // size of biggest r/w to be performed by kernels on this memory
                                          // 4, 8 or 16 bytes
-                                              unsigned int ElementSizeBytes);
+                                          unsigned int ElementSizeBytes
+                                         );
 #else
 typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
 typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
@ -1451,7 +1461,8 @@ typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr  *dptr,
                                          unsigned int Height,
                                          // size of biggest r/w to be performed by kernels on this memory
                                          // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes);
+                                          unsigned int ElementSizeBytes
+                                         );
 #endif

 typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
@ -1484,9 +1495,9 @@ typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
    char reserved[CU_IPC_HANDLE_SIZE];
 } CUipcMemHandle;

-    typedef enum CUipcMem_flags_enum {
-        CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS =
-            0x1 /**< Automatically enable peer access between remote devices as needed */
+typedef enum CUipcMem_flags_enum
+{
+    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
 } CUipcMem_flags;

 typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
@ -1499,14 +1510,9 @@ typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
 #endif

 typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-    typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
-    ;
+typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
 typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
-    typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice,
-                                           CUcontext   dstContext,
-                                           CUdeviceptr srcDevice,
-                                           CUcontext   srcContext,
-                                           size_t      ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);

 /************************************
 **
@ -1535,8 +1541,7 @@ typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
 typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);

 // array <-> array memory
-    typedef CUresult CUDAAPI
-    tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
 #else
 // system <-> device memory
 typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
@ -1546,28 +1551,15 @@ typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, uns
 typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

 // device <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray      dstArray,
-                                       unsigned int dstOffset,
-                                       CUdeviceptr  srcDevice,
-                                       unsigned int ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr  dstDevice,
-                                       CUarray      srcArray,
-                                       unsigned int srcOffset,
-                                       unsigned int ByteCount);
+typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
+typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray      dstArray,
-                                       unsigned int dstOffset,
-                                       const void  *srcHost,
-                                       unsigned int ByteCount);
+typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
 typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

 // array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray      dstArray,
-                                       unsigned int dstOffset,
-                                       CUarray      srcArray,
-                                       unsigned int srcOffset,
-                                       unsigned int ByteCount);
+typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
 #endif

 // 2D memcpy
@ -1594,51 +1586,36 @@ typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray      dstArray,
 #if __CUDA_API_VERSION >= 3020
 // system <-> device memory
 typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                                const void *srcHost,
-                                                size_t      ByteCount,
-                                                CUstream    hStream);
+                                             const void *srcHost, size_t ByteCount, CUstream hStream);
 typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                                CUdeviceptr srcDevice,
-                                                size_t      ByteCount,
-                                                CUstream    hStream);
+                                             CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);

 // device <-> device memory
 typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                                CUdeviceptr srcDevice,
-                                                size_t      ByteCount,
-                                                CUstream    hStream);
+                                             CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);

 // system <-> array memory
-    typedef CUresult CUDAAPI
-    tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
-    typedef CUresult CUDAAPI
-    tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
+                                             const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
+                                             size_t ByteCount, CUstream hStream);

 #else
 // system <-> device memory
 typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void  *srcHost,
-                                            unsigned int ByteCount,
-                                            CUstream     hStream);
+                                             const void *srcHost, unsigned int ByteCount, CUstream hStream);
 typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr  srcDevice,
-                                            unsigned int ByteCount,
-                                            CUstream     hStream);
+                                             CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);

 // device <-> device memory
 typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr  srcDevice,
-                                            unsigned int ByteCount,
-                                            CUstream     hStream);
+                                             CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray      dstArray,
-                                            unsigned int dstOffset,
-                                            const void  *srcHost,
-                                            unsigned int ByteCount,
-                                            CUstream     hStream);
-typedef CUresult CUDAAPI
-tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
+                                             const void *srcHost, unsigned int ByteCount, CUstream hStream);
+typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
+                                             unsigned int ByteCount, CUstream hStream);
 #endif

 // 2D memcpy
@ -1657,22 +1634,13 @@ tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsi
 typedef CUresult  CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);

 #if __CUDA_API_VERSION >= 3020
-    typedef CUresult CUDAAPI
-    tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
-    typedef CUresult CUDAAPI
-    tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
-    typedef CUresult CUDAAPI
-    tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
 #else
-typedef CUresult CUDAAPI
-tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr    dstDevice,
-                                        unsigned int   dstPitch,
-                                        unsigned short us,
-                                        unsigned int   Width,
-                                        unsigned int   Height);
-typedef CUresult CUDAAPI
-tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
 #endif

 /************************************
@ -1689,16 +1657,10 @@ tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, un
 typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);

 typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
-                                             unsigned int gridDimX,
-                                             unsigned int gridDimY,
-                                             unsigned int gridDimZ,
-                                             unsigned int blockDimX,
-                                             unsigned int blockDimY,
-                                             unsigned int blockDimZ,
+                                         unsigned int gridDimX,  unsigned int gridDimY,  unsigned int gridDimZ,
+                                         unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
                                         unsigned int sharedMemBytes,
-                                             CUstream     hStream,
-                                             void       **kernelParams,
-                                             void       **extra);
+                                         CUstream hStream, void **kernelParams, void **extra);

 /************************************
 **
@ -1714,12 +1676,8 @@ tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, un
 typedef CUresult  CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);

 #if __CUDA_API_VERSION >= 5000
-    typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray              *pHandle,
-                                                     const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
-                                                     unsigned int                   numMipmapLevels);
-    typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray         *pLevelArray,
-                                                       CUmipmappedArray hMipmappedArray,
-                                                       unsigned int     level);
+typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
 typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
 #endif

@ -1736,19 +1694,10 @@ tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, un

 #if __CUDA_API_VERSION >= 3020
 typedef CUresult  CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
-    typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref                     hTexRef,
-                                                   const CUDA_ARRAY_DESCRIPTOR *desc,
-                                                   CUdeviceptr                  dptr,
-                                                   size_t                       Pitch);
+typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
 #else
-typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset,
-                                             CUtexref      hTexRef,
-                                             CUdeviceptr   dptr,
-                                             unsigned int  bytes);
-typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref                     hTexRef,
-                                               const CUDA_ARRAY_DESCRIPTOR *desc,
-                                               CUdeviceptr                  dptr,
-                                               unsigned int                 Pitch);
+typedef CUresult  CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
+typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
 #endif

 typedef CUresult  CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
@ -1814,10 +1763,7 @@ typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref                     hTex
 ***********************************/
 typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
 typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
-    typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream         hStream,
-                                                  CUstreamCallback callback,
-                                                  void            *userData,
-                                                  unsigned int     flags);
+typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);

 typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
 typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
@ -1829,28 +1775,17 @@ typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref                     hTex
 **
 ***********************************/
 typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
-    typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray           *pArray,
-                                                                  CUgraphicsResource resource,
-                                                                  unsigned int       arrayIndex,
-                                                                  unsigned int       mipLevel);
+typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);

 #if __CUDA_API_VERSION >= 3020
-    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr       *pDevPtr,
-                                                                 size_t            *pSize,
-                                                                 CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
 #else
-typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr       *pDevPtr,
-                                                             unsigned int      *pSize,
-                                                             CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
 #endif

 typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
-    typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int        count,
-                                                     CUgraphicsResource *resources,
-                                                     CUstream            hStream);
-    typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int        count,
-                                                       CUgraphicsResource *resources,
-                                                       CUstream            hStream);
+typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);

 /************************************
 **
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/helper_cuda_drvapi.h
@ -14,17 +14,21 @@
 #ifndef HELPER_CUDA_DRVAPI_H
 #define HELPER_CUDA_DRVAPI_H

-#include <helper_string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

+#include <helper_string.h>
+
 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
 #endif

 #ifndef HELPER_CUDA_DRVAPI_H
-inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
 #endif

 #ifndef EXIT_WAIVED
@ -43,43 +47,39 @@ inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5
 #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

 // These are the inline versions for all of the SDK helper functions
-inline void __checkCudaErrors(CUresult err, const char *file, const int line)
-{
+inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
  if (CUDA_SUCCESS != err) {
    const char *errorStr = NULL;
    cuGetErrorString(err, &errorStr);
    fprintf(stderr,
            "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
            "line %i.\n",
-                err,
-                errorStr,
-                file,
-                line);
+            err, errorStr, file, line);
    exit(EXIT_FAILURE);
  }
 }
 #endif

 // This function wraps the CUDA Driver API into a template function
-template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
-{
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
+                             int device) {
  checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
 }
 #endif

 // Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2CoresDRV(int major, int minor)
-{
+inline int _ConvertSMVer2CoresDRV(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine the #
  // of cores per SM
-    typedef struct
-    {
+  typedef struct {
    int SM;  // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
             // minor version
    int Cores;
  } sSMtoCores;

-    sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
@ -110,18 +110,16 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor)

  // If we don't find the values, we default use the previous one to run
  // properly
-    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
-           major,
-           minor,
-           nGpuArchCoresPerSM[index - 1].Cores);
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
 }
  // end of GPU Architecture definitions

 #ifdef __cuda_cuda_h__
 // General GPU Device CUDA Initialization
-inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
-{
+inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
  int cuDevice = 0;
  int deviceCount = 0;
  checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
@ -142,8 +140,11 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)

  if (dev > deviceCount - 1) {
    fprintf(stderr, "\n");
-        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
-        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            deviceCount);
+    fprintf(stderr,
+            ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
+            dev);
    fprintf(stderr, "\n");
    return -dev;
  }
@ -170,8 +171,7 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
 }

 // This function returns the best GPU based on performance
-inline int gpuGetMaxGflopsDeviceIdDRV()
-{
+inline int gpuGetMaxGflopsDeviceIdDRV() {
  CUdevice current_device = 0;
  CUdevice max_perf_device = 0;
  int device_count = 0;
@ -187,7 +187,8 @@ inline int gpuGetMaxGflopsDeviceIdDRV()
  checkCudaErrors(cuDeviceGetCount(&device_count));

  if (device_count == 0) {
-        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
    exit(EXIT_FAILURE);
  }

@ -195,31 +196,36 @@ inline int gpuGetMaxGflopsDeviceIdDRV()
  current_device = 0;

  while (current_device < device_count) {
-        checkCudaErrors(
-            cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+        current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));

    int computeMode;
-        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
+                          current_device);

    if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
      if (major == 9999 && minor == 9999) {
        sm_per_multiproc = 1;
-            }
-            else {
+      } else {
        sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
      }

-            unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
+      unsigned long long compute_perf =
+          (unsigned long long)(multiProcessorCount * sm_per_multiproc *
+                               clockRate);

      if (compute_perf > max_compute_perf) {
          max_compute_perf = compute_perf;
          max_perf_device = current_device;
      }
-        }
-        else {
+    } else {
      devices_prohibited++;
    }

@ -237,8 +243,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV()
 }

 // General initialization call to pick the best CUDA Device
-inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
-{
+inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
  CUdevice cuDevice;
  int devID = 0;

@ -250,8 +255,7 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
      printf("exiting...\n");
      exit(EXIT_SUCCESS);
    }
-    }
-    else {
+  } else {
    // Otherwise pick the device with highest Gflops/s
    char name[100];
    devID = gpuGetMaxGflopsDeviceIdDRV();
@ -265,8 +269,7 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
  return cuDevice;
 }

-inline CUdevice findIntegratedGPUDrv()
-{
+inline CUdevice findIntegratedGPUDrv() {
  CUdevice current_device = 0;
  int device_count = 0;
  int devices_prohibited = 0;
@ -283,22 +286,28 @@ inline CUdevice findIntegratedGPUDrv()
  // Find the integrated GPU which is compute capable
  while (current_device < device_count) {
    int computeMode = -1;
-        checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
+    checkCudaErrors(cuDeviceGetAttribute(
+        &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));

    // If GPU is integrated and is not running on Compute Mode prohibited use
    // that
    if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
      int major = 0, minor = 0;
      char deviceName[256];
-            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+      checkCudaErrors(cuDeviceGetAttribute(
+          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+          current_device));
+      checkCudaErrors(cuDeviceGetAttribute(
+          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+          current_device));
      checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
-            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, deviceName, major, minor);

      return current_device;
-        }
-        else {
+    } else {
      devices_prohibited++;
    }

@ -314,26 +323,29 @@ inline CUdevice findIntegratedGPUDrv()
 }

 // General check for CUDA GPU SM Capabilities
-inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
-{
+inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
+                                     int devID) {
  CUdevice cuDevice;
  char name[256];
  int major = 0, minor = 0;

  checkCudaErrors(cuDeviceGet(&cuDevice, devID));
  checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));

-    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
-        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
+  if ((major > major_version) ||
+      (major == major_version && minor >= minor_version)) {
+    printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
+           major, minor);
    return true;
-    }
-    else {
-        printf("No GPU device was found that can support CUDA compute capability "
+  } else {
+    printf(
+        "No GPU device was found that can support CUDA compute capability "
        "%d.%d.\n",
-               major_version,
-               minor_version);
+        major_version, minor_version);
    return false;
  }
 }
@ -342,3 +354,4 @@ inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int d
  // end of CUDA Helper Functions

 #endif  // HELPER_CUDA_DRVAPI_H
+
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMulDynlinkJIT.cpp
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMulDynlinkJIT.cpp
@ -43,10 +43,10 @@
 */

 // includes, system
-#include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
+#include <math.h>

 // includes, CUDA
 #include "cuda_drvapi_dynlink.h"
@ -78,7 +78,8 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
 ////////////////////////////////////////////////////////////////////////////////
 void randomInit(float *data, size_t size)
 {
-    for (size_t i = 0; i < size; ++i) {
+    for (size_t i = 0; i < size; ++i)
+    {
        data[i] = rand() / (float)RAND_MAX;
    }
 }
@ -99,14 +100,18 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));

    // This assumes that the user is attempting to specify a explicit device -device=n
-    if (argc > 1) {
+    if (argc > 1)
+    {
        bool bFound = false;

-        for (int param = 0; param < argc; param++) {
-            if (!strncmp(argv[param], "-device", 7)) {
+        for (int param=0; param < argc; param++)
+        {
+            if (!strncmp(argv[param], "-device", 7))
+            {
                int i=(int)strlen(argv[1]);

-                while (argv[1][i] != '=') {
+                while (argv[1][i] != '=')
+                {
                    i--;
                }

@ -123,15 +128,16 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    int deviceCount = 0;
    checkCudaErrors(cuDeviceGetCount(&deviceCount));

-    if (deviceCount == 0) {
+    if (deviceCount == 0)
+    {
        fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
        exit(EXIT_SUCCESS);
    }

-    if (devID < 0)
-        devID = 0;
+    if (devID < 0) devID = 0;

-    if (devID > deviceCount - 1) {
+    if (devID > deviceCount -1)
+    {
        fprintf(stderr, "initCUDA (Device=%d) invalid GPU device.  %d GPU device(s) detected.\n\n", devID, deviceCount);
        status = CUDA_ERROR_NOT_FOUND;

@ -153,7 +159,8 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    // create context for picked device
    status = cuCtxCreate(&g_cuContext, 0, cuDevice);

-    if (CUDA_SUCCESS != status) {
+    if (CUDA_SUCCESS != status)
+    {
        cuCtxDestroy(g_cuContext);
        exit(EXIT_SUCCESS);
    }
@ -184,11 +191,9 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
        printf("> Compiling CUDA module\n");

 #if defined(_WIN64) || defined(__LP64__)
-        status =
-            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #else
-        status =
-            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #endif

        printf("> PTX JIT log:\n%s\n", jitLogBuffer);
@ -198,17 +203,19 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
        delete [] jitLogBuffer;
    }

-    if (CUDA_SUCCESS != status) {
+    if (CUDA_SUCCESS != status)
+    {
        printf("Error while compiling PTX\n");
        cuCtxDestroy(g_cuContext);
        exit(EXIT_FAILURE);
    }

    // retrieve CUDA function from the compiled module
-    status = cuModuleGetFunction(
-        &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
+    status = cuModuleGetFunction(&cuFunction, cuModule,
+                                 (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");

-    if (CUDA_SUCCESS != status) {
+    if (CUDA_SUCCESS != status)
+    {
        cuCtxDestroy(g_cuContext);
        exit(EXIT_FAILURE);
    }
@ -273,8 +280,10 @@ int main(int argc, char **argv)
        int Matrix_Width_B = WB;
        void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };

-        checkCudaErrors(cuLaunchKernel(
-            matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
+        checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
+                                       block_size     , block_size     , 1,
+                                       0,
+                                       NULL, args, NULL));
    }
 #else // __CUDA_API_VERSION <= 3020
    {
@ -322,7 +331,8 @@ int main(int argc, char **argv)
    // check result
    float diff=0.0f;

-    for (unsigned int i = 0; i < size_C; i++) {
+    for (unsigned int i=0; i<size_C; i++)
+    {
        float tmp = reference[i] - h_C[i];
        diff += tmp*tmp;
    }
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_gold.cpp
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_gold.cpp
@ -28,7 +28,8 @@

 ////////////////////////////////////////////////////////////////////////////////
 // export C interface
-extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C"
+void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set
@ -39,13 +40,16 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
 //! @param hA         height of matrix A
 //! @param wB         width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+void
+computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 {
    for (unsigned int i = 0; i < hA; ++i)
-        for (unsigned int j = 0; j < wB; ++j) {
+        for (unsigned int j = 0; j < wB; ++j)
+        {
            double sum = 0;

-            for (unsigned int k = 0; k < wA; ++k) {
+            for (unsigned int k = 0; k < wA; ++k)
+            {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_32_ptxdump.c
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_32_ptxdump.c
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_32_ptxdump.h
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_32_ptxdump.h
@ -32,8 +32,7 @@
 #define __matrixMul_kernel_32_ptxdump_h__

 #if defined __cplusplus
-extern "C"
-{
+extern "C" {
 #endif

    extern unsigned char matrixMul_kernel_32_ptxdump[25784];
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_64_ptxdump.c
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_64_ptxdump.c
--- a/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_64_ptxdump.h
+++ b/Samples/0_Introduction/matrixMulDynlinkJIT/matrixMul_kernel_64_ptxdump.h
@ -32,8 +32,7 @@
 #define __matrixMul_kernel_64_ptxdump_h__

 #if defined __cplusplus
-extern "C"
-{
+extern "C" {
 #endif

    extern unsigned char matrixMul_kernel_64_ptxdump[26489];
--- a/Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt
+++ b/Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/matrixMul_nvrtc/README.md
+++ b/Samples/0_Introduction/matrixMul_nvrtc/README.md
@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication.  To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication.  To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.

 ## Key Concepts

@ -30,7 +30,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemF

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/matrixMul_nvrtc/matrixMul.cpp
+++ b/Samples/0_Introduction/matrixMul_nvrtc/matrixMul.cpp
@ -42,19 +42,17 @@
 */

 // System includes
-#include <assert.h>
 #include <stdio.h>
+#include <assert.h>

 // CUDA runtime
 #include <cuda_runtime.h>
-
 #include "nvrtc_helper.h"

 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>

-void constantInit(float *data, int size, float val)
-{
+void constantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
@ -63,8 +61,8 @@ void constantInit(float *data, int size, float val)
 /**
 * Run a simple test of matrix multiplication using CUDA
 */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
-{
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
+                   dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
@ -116,27 +114,24 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim

  CUfunction kernel_addr;
  if (block_size == 16) {
-        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
-    }
-    else {
-        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
+    checkCudaErrors(
+        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
+  } else {
+    checkCudaErrors(
+        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
  }

-    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
+  void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
+                 (void *)&dimsB.x};

  // Execute the kernel
  int nIter = 300;

  for (int j = 0; j < nIter; j++) {
-        checkCudaErrors(cuLaunchKernel(kernel_addr,
-                                       grid.x,
-                                       grid.y,
-                                       grid.z, /* grid dim */
-                                       threads.x,
-                                       threads.y,
-                                       threads.z, /* block dim */
-                                       0,
-                                       0,       /* shared mem, stream */
+    checkCudaErrors(
+        cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
+                       threads.x, threads.y, threads.z,     /* block dim */
+                       0, 0,    /* shared mem, stream */
                       &arr[0], /* arguments */
                       0));

@ -162,14 +157,16 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
    double rel_err = abs_err / abs_val / dot_length;

    if (rel_err > eps) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
+      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
+             h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

-    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");

  // Clean up memory
@ -183,8 +180,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim

  if (correct) {
    return EXIT_SUCCESS;
-    }
-    else {
+  } else {
    return EXIT_FAILURE;
  }
 }
@ -193,15 +189,16 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
 * Program main
 */

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("[Matrix Multiply Using CUDA] - Starting...\n");

-    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
+  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
+      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+    printf(
+        "  Note: Outer matrix dimensions of A & B matrices must be equal.\n");

    exit(EXIT_SUCCESS);
  }
@ -237,11 +234,13 @@ int main(int argc, char **argv)
  }

  if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
+    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
+           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
+  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
+         dimsB.y);

  int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

--- a/Samples/0_Introduction/matrixMul_nvrtc/matrixMul_kernel.cu
+++ b/Samples/0_Introduction/matrixMul_nvrtc/matrixMul_kernel.cu
@ -48,10 +48,11 @@

 #include <cooperative_groups.h>

-template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
-{
+template <int BLOCK_SIZE>
+__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
  // Handle to thread block group
-    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
+  cooperative_groups::thread_block cta =
+      cooperative_groups::this_thread_block();
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;
@ -119,12 +120,12 @@ template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, floa
  C[c + wB * ty + tx] = Csub;
 }

-extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
-{
+extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
+                                                 int wA, int wB) {
  matrixMulCUDA<16>(C, A, B, wA, wB);
 }

-extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
-{
+extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
+                                                 int wA, int wB) {
  matrixMulCUDA<32>(C, A, B, wA, wB);
 }
--- a/Samples/0_Introduction/mergeSort/CMakeLists.txt
+++ b/Samples/0_Introduction/mergeSort/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/mergeSort/README.md
+++ b/Samples/0_Introduction/mergeSort/README.md
@ -27,6 +27,6 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/mergeSort/bitonic.cu
+++ b/Samples/0_Introduction/mergeSort/bitonic.cu
@ -28,13 +28,12 @@
 #include <cooperative_groups.h>

 namespace cg = cooperative_groups;
-#include <assert.h>
 #include <helper_cuda.h>
-
+#include <assert.h>
 #include "mergeSort_common.h"

-inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
-{
+inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
+                                  uint &valB, uint arrowDir) {
  uint t;

  if ((keyA > keyB) == arrowDir) {
@ -47,9 +46,9 @@ inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB
  }
 }

-__global__ void
-bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
-{
+__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
+                                        uint *d_SrcKey, uint *d_SrcVal,
+                                        uint arrayLength, uint sortDir) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  // Shared memory storage for one or more short vectors
@ -63,8 +62,10 @@ bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_
  d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  s_key[threadIdx.x + 0] = d_SrcKey[0];
  s_val[threadIdx.x + 0] = d_SrcVal[0];
-    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+  s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
+      d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+  s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
+      d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

  for (uint size = 2; size < arrayLength; size <<= 1) {
    // Bitonic merge
@ -73,7 +74,8 @@ bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_
    for (uint stride = size / 2; stride > 0; stride >>= 1) {
      cg::sync(cta);
      uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
+      Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
+                 s_val[pos + stride], dir);
    }
  }

@ -82,25 +84,26 @@ bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_
    for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
      cg::sync(cta);
      uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
+      Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
+                 s_val[pos + stride], sortDir);
    }
  }

  cg::sync(cta);
  d_DstKey[0] = s_key[threadIdx.x + 0];
  d_DstVal[0] = s_val[threadIdx.x + 0];
-    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
+      s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
+      s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }

 // Helper function (also used by odd-even merge sort)
-extern "C" uint factorRadix2(uint *log2L, uint L)
-{
+extern "C" uint factorRadix2(uint *log2L, uint L) {
  if (!L) {
    *log2L = 0;
    return 0;
-    }
-    else {
+  } else {
    for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
      ;

@ -108,14 +111,10 @@ extern "C" uint factorRadix2(uint *log2L, uint L)
  }
 }

-extern "C" void bitonicSortShared(uint *d_DstKey,
-                                  uint *d_DstVal,
-                                  uint *d_SrcKey,
-                                  uint *d_SrcVal,
-                                  uint  batchSize,
-                                  uint  arrayLength,
-                                  uint  sortDir)
-{
+extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
+                                  uint *d_SrcKey, uint *d_SrcVal,
+                                  uint batchSize, uint arrayLength,
+                                  uint sortDir) {
  // Nothing to sort
  if (arrayLength < 2) {
    return;
@ -132,25 +131,32 @@ extern "C" void bitonicSortShared(uint *d_DstKey,
  assert(arrayLength <= SHARED_SIZE_LIMIT);
  assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);

-    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
+  bitonicSortSharedKernel<<<blockCount, threadCount>>>(
+      d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
  getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
+static inline __host__ __device__ uint iDivUp(uint a, uint b) {
+  return ((a % b) == 0) ? (a / b) : (a / b + 1);
+}

-static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
+static inline __host__ __device__ uint getSampleCount(uint dividend) {
+  return iDivUp(dividend, SAMPLE_STRIDE);
+}

 template <uint sortDir>
-static inline __device__ void
-ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
-{
+static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
+                                                 uint &flagA, uint &keyB,
+                                                 uint &valB, uint &flagB,
+                                                 uint arrowDir) {
  uint t;

-    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
-        || ((arrowDir != sortDir) && (flagB == 1))) {
+  if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
+      ((arrowDir == sortDir) && (flagA == 1)) ||
+      ((arrowDir != sortDir) && (flagB == 1))) {
    t = keyA;
    keyA = keyB;
    keyB = t;
@ -164,15 +170,9 @@ ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB,
 }

 template <uint sortDir>
-__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
-                                                      uint *d_DstVal,
-                                                      uint *d_SrcKey,
-                                                      uint *d_SrcVal,
-                                                      uint *d_LimitsA,
-                                                      uint *d_LimitsB,
-                                                      uint  stride,
-                                                      uint  N)
-{
+__global__ void bitonicMergeElementaryIntervalsKernel(
+    uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
+    uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[2 * SAMPLE_STRIDE];
@ -200,8 +200,10 @@ __global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
    startSrcB = d_LimitsB[blockIdx.x];
    startDst = startSrcA + startSrcB;

-        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
-        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
+    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
+                                                    : segmentElementsA;
+    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
+                                                    : segmentElementsB;
    lenSrcA = endSrcA - startSrcA;
    lenSrcB = endSrcB - startSrcB;
  }
@ -220,8 +222,10 @@ __global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,

  // Prepare for bitonic merge by inversing the ordering
  if (threadIdx.x < lenSrcB) {
-        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
-        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
+    s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
+        d_SrcKey[stride + startSrcB + threadIdx.x];
+    s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
+        d_SrcVal[stride + startSrcB + threadIdx.x];
    s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
  }

@ -229,13 +233,9 @@ __global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
  for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
    cg::sync(cta);
    uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-        ComparatorExtended<sortDir>(s_key[pos + 0],
-                                    s_val[pos + 0],
-                                    s_inf[pos + 0],
-                                    s_key[pos + stride],
-                                    s_val[pos + stride],
-                                    s_inf[pos + stride],
-                                    sortDir);
+    ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
+                                s_key[pos + stride], s_val[pos + stride],
+                                s_inf[pos + stride], sortDir);
  }

  // Store sorted data
@ -254,28 +254,26 @@ __global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
  }
 }

-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
-                                                uint *d_DstVal,
-                                                uint *d_SrcKey,
-                                                uint *d_SrcVal,
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
+                                                uint *d_SrcKey, uint *d_SrcVal,
                                                uint *d_LimitsA,
-                                                uint *d_LimitsB,
-                                                uint  stride,
-                                                uint  N,
-                                                uint  sortDir)
-{
+                                                uint *d_LimitsB, uint stride,
+                                                uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);

-    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
+  uint mergePairs = (lastSegmentElements > stride)
+                        ? getSampleCount(N)
+                        : (N - lastSegmentElements) / SAMPLE_STRIDE;

  if (sortDir) {
-        bitonicMergeElementaryIntervalsKernel<1U>
-            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
+    bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
+        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
+        N);
    getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
-    }
-    else {
-        bitonicMergeElementaryIntervalsKernel<0U>
-            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
+  } else {
+    bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
+        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
+        N);
    getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
  }
 }
--- a/Samples/0_Introduction/mergeSort/main.cpp
+++ b/Samples/0_Introduction/mergeSort/main.cpp
@ -26,19 +26,17 @@
 */

 #include <assert.h>
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-#include <helper_functions.h>
 #include <stdio.h>
 #include <stdlib.h>
-
+#include <cuda_runtime.h>
+#include <helper_functions.h>
+#include <helper_cuda.h>
 #include "mergeSort_common.h"

 ////////////////////////////////////////////////////////////////////////////////
 // Test driver
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
  uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
  StopWatchInterface *hTimer = NULL;
@ -77,8 +75,10 @@ int main(int argc, char **argv)
  checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
-    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

  printf("Initializing GPU merge sort...\n");
  initMergeSort();
@ -93,8 +93,10 @@ int main(int argc, char **argv)
  printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

  printf("Reading back GPU merge sort results...\n");
-    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
-    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
+  checkCudaErrors(
+      cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
+  checkCudaErrors(
+      cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

  printf("Inspecting the results...\n");
  uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
--- a/Samples/0_Introduction/mergeSort/mergeSort.cu
+++ b/Samples/0_Introduction/mergeSort/mergeSort.cu
@ -39,19 +39,21 @@
 namespace cg = cooperative_groups;

 #include <helper_cuda.h>
-
 #include "mergeSort_common.h"

 ////////////////////////////////////////////////////////////////////////////////
 // Helper functions
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
+static inline __host__ __device__ uint iDivUp(uint a, uint b) {
+  return ((a % b) == 0) ? (a / b) : (a / b + 1);
+}

-static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
+static inline __host__ __device__ uint getSampleCount(uint dividend) {
+  return iDivUp(dividend, SAMPLE_STRIDE);
+}

 #define W (sizeof(uint) * 8)
-static inline __device__ uint nextPowerOfTwo(uint x)
-{
+static inline __device__ uint nextPowerOfTwo(uint x) {
  /*
      --x;
      x |= x >> 1;
@ -64,8 +66,9 @@ static inline __device__ uint nextPowerOfTwo(uint x)
  return 1U << (W - __clz(x - 1));
 }

-template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
-{
+template <uint sortDir>
+static inline __device__ uint binarySearchInclusive(uint val, uint *data,
+                                                    uint L, uint stride) {
  if (L == 0) {
    return 0;
  }
@ -75,7 +78,8 @@ template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint
  for (; stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

-        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
+    if ((sortDir && (data[newPos - 1] <= val)) ||
+        (!sortDir && (data[newPos - 1] >= val))) {
      pos = newPos;
    }
  }
@ -83,8 +87,9 @@ template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint
  return pos;
 }

-template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
-{
+template <uint sortDir>
+static inline __device__ uint binarySearchExclusive(uint val, uint *data,
+                                                    uint L, uint stride) {
  if (L == 0) {
    return 0;
  }
@ -94,7 +99,8 @@ template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint
  for (; stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

-        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
+    if ((sortDir && (data[newPos - 1] < val)) ||
+        (!sortDir && (data[newPos - 1] > val))) {
      pos = newPos;
    }
  }
@ -106,8 +112,9 @@ template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint
 // Bottom-level merge sort (binary search-based)
 ////////////////////////////////////////////////////////////////////////////////
 template <uint sortDir>
-__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
-{
+__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
+                                      uint *d_SrcKey, uint *d_SrcVal,
+                                      uint arrayLength) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[SHARED_SIZE_LIMIT];
@ -119,8 +126,10 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_Sr
  d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  s_key[threadIdx.x + 0] = d_SrcKey[0];
  s_val[threadIdx.x + 0] = d_SrcVal[0];
-    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+  s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
+      d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+  s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
+      d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

  for (uint stride = 1; stride < arrayLength; stride <<= 1) {
    uint lPos = threadIdx.x & (stride - 1);
@ -132,8 +141,12 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_Sr
    uint valA = baseVal[lPos + 0];
    uint keyB = baseKey[lPos + stride];
    uint valB = baseVal[lPos + stride];
-        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
-        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;
+    uint posA =
+        binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) +
+        lPos;
+    uint posB =
+        binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) +
+        lPos;

    cg::sync(cta);
    baseKey[posA] = keyA;
@ -145,18 +158,15 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_Sr
  cg::sync(cta);
  d_DstKey[0] = s_key[threadIdx.x + 0];
  d_DstVal[0] = s_val[threadIdx.x + 0];
-    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
+      s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
+      s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }

-static void mergeSortShared(uint *d_DstKey,
-                            uint *d_DstVal,
-                            uint *d_SrcKey,
-                            uint *d_SrcVal,
-                            uint  batchSize,
-                            uint  arrayLength,
-                            uint  sortDir)
-{
+static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
+                            uint *d_SrcVal, uint batchSize, uint arrayLength,
+                            uint sortDir) {
  if (arrayLength < 2) {
    return;
  }
@ -167,11 +177,12 @@ static void mergeSortShared(uint *d_DstKey,
  uint threadCount = SHARED_SIZE_LIMIT / 2;

  if (sortDir) {
-        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
+    mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(
+        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
    getLastCudaError("mergeSortShared<1><<<>>> failed\n");
-    }
-    else {
-        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
+  } else {
+    mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(
+        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
    getLastCudaError("mergeSortShared<0><<<>>> failed\n");
  }
 }
@ -180,9 +191,9 @@ static void mergeSortShared(uint *d_DstKey,
 // Merge step 1: generate sample ranks
 ////////////////////////////////////////////////////////////////////////////////
 template <uint sortDir>
-__global__ void
-generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
-{
+__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
+                                          uint *d_SrcKey, uint stride, uint N,
+                                          uint threadCount) {
  uint pos = blockIdx.x * blockDim.x + threadIdx.x;

  if (pos >= threadCount) {
@ -203,30 +214,33 @@ generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint s
  if (i < segmentSamplesA) {
    d_RanksA[i] = i * SAMPLE_STRIDE;
    d_RanksB[i] = binarySearchExclusive<sortDir>(
-            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
+        d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB,
+        nextPowerOfTwo(segmentElementsB));
  }

  if (i < segmentSamplesB) {
    d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
    d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
-            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
+        d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA,
+        nextPowerOfTwo(segmentElementsA));
  }
 }

-static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
-{
+static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
+                                uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
-    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
+  uint threadCount =
+      (lastSegmentElements > stride)
+          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  if (sortDir) {
-        generateSampleRanksKernel<1U>
-            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
+    generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>(
+        d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
    getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
-    }
-    else {
-        generateSampleRanksKernel<0U>
-            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
+  } else {
+    generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>(
+        d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
    getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
  }
 }
@ -234,8 +248,9 @@ static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 2: generate sample ranks and indices
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
-{
+__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,
+                                           uint stride, uint N,
+                                           uint threadCount) {
  uint pos = blockIdx.x * blockDim.x + threadIdx.x;

  if (pos >= threadCount) {
@ -254,29 +269,36 @@ __global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint s

  if (i < segmentSamplesA) {
    uint dstPos = binarySearchExclusive<1U>(
-                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
-                    + i;
+                      d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB,
+                      nextPowerOfTwo(segmentSamplesB)) +
+                  i;
    d_Limits[dstPos] = d_Ranks[i];
  }

  if (i < segmentSamplesB) {
-        uint dstPos = binarySearchInclusive<1U>(
-                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
-                    + i;
+    uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i],
+                                            d_Ranks, segmentSamplesA,
+                                            nextPowerOfTwo(segmentSamplesA)) +
+                  i;
    d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
  }
 }

-static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
-{
+static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
+                                 uint *d_RanksA, uint *d_RanksB, uint stride,
+                                 uint N) {
  uint lastSegmentElements = N % (2 * stride);
-    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
+  uint threadCount =
+      (lastSegmentElements > stride)
+          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

-    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
+  mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
+      d_LimitsA, d_RanksA, stride, N, threadCount);
  getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

-    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
+  mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
+      d_LimitsB, d_RanksB, stride, N, threadCount);
  getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
 }

@ -284,30 +306,24 @@ static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_Ranks
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
 template <uint sortDir>
-inline __device__ void merge(uint            *dstKey,
-                             uint            *dstVal,
-                             uint            *srcAKey,
-                             uint            *srcAVal,
-                             uint            *srcBKey,
-                             uint            *srcBVal,
-                             uint             lenA,
-                             uint             nPowTwoLenA,
-                             uint             lenB,
-                             uint             nPowTwoLenB,
-                             cg::thread_block cta)
-{
+inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
+                             uint *srcAVal, uint *srcBKey, uint *srcBVal,
+                             uint lenA, uint nPowTwoLenA, uint lenB,
+                             uint nPowTwoLenB, cg::thread_block cta) {
  uint keyA, valA, keyB, valB, dstPosA, dstPosB;

  if (threadIdx.x < lenA) {
    keyA = srcAKey[threadIdx.x];
    valA = srcAVal[threadIdx.x];
-        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
+    dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) +
+              threadIdx.x;
  }

  if (threadIdx.x < lenB) {
    keyB = srcBKey[threadIdx.x];
    valB = srcBVal[threadIdx.x];
-        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
+    dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) +
+              threadIdx.x;
  }

  cg::sync(cta);
@ -324,15 +340,10 @@ inline __device__ void merge(uint            *dstKey,
 }

 template <uint sortDir>
-__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
-                                               uint *d_DstVal,
-                                               uint *d_SrcKey,
-                                               uint *d_SrcVal,
-                                               uint *d_LimitsA,
-                                               uint *d_LimitsB,
-                                               uint  stride,
-                                               uint  N)
-{
+__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
+                                               uint *d_SrcKey, uint *d_SrcVal,
+                                               uint *d_LimitsA, uint *d_LimitsB,
+                                               uint stride, uint N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[2 * SAMPLE_STRIDE];
@ -357,8 +368,10 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,

    startSrcA = d_LimitsA[blockIdx.x];
    startSrcB = d_LimitsB[blockIdx.x];
-        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
-        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
+    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
+                                                    : segmentElementsA;
+    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
+                                                    : segmentElementsB;
    lenSrcA = endSrcA - startSrcA;
    lenSrcB = endSrcB - startSrcB;
    startDstA = startSrcA + startSrcB;
@ -374,23 +387,17 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
  }

  if (threadIdx.x < lenSrcB) {
-        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
-        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
+    s_key[threadIdx.x + SAMPLE_STRIDE] =
+        d_SrcKey[stride + startSrcB + threadIdx.x];
+    s_val[threadIdx.x + SAMPLE_STRIDE] =
+        d_SrcVal[stride + startSrcB + threadIdx.x];
  }

  // Merge data in shared memory
  cg::sync(cta);
-    merge<sortDir>(s_key,
-                   s_val,
-                   s_key + 0,
-                   s_val + 0,
-                   s_key + SAMPLE_STRIDE,
-                   s_val + SAMPLE_STRIDE,
-                   lenSrcA,
-                   SAMPLE_STRIDE,
-                   lenSrcB,
-                   SAMPLE_STRIDE,
-                   cta);
+  merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE,
+                 s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB,
+                 SAMPLE_STRIDE, cta);

  // Store merged data
  cg::sync(cta);
@ -406,77 +413,63 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
  }
 }

-static void mergeElementaryIntervals(uint *d_DstKey,
-                                     uint *d_DstVal,
-                                     uint *d_SrcKey,
-                                     uint *d_SrcVal,
-                                     uint *d_LimitsA,
-                                     uint *d_LimitsB,
-                                     uint  stride,
-                                     uint  N,
-                                     uint  sortDir)
-{
+static void mergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
+                                     uint *d_SrcKey, uint *d_SrcVal,
+                                     uint *d_LimitsA, uint *d_LimitsB,
+                                     uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
-    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
+  uint mergePairs = (lastSegmentElements > stride)
+                        ? getSampleCount(N)
+                        : (N - lastSegmentElements) / SAMPLE_STRIDE;

  if (sortDir) {
-        mergeElementaryIntervalsKernel<1U>
-            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
+    mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
+        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
+        N);
    getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
-    }
-    else {
-        mergeElementaryIntervalsKernel<0U>
-            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
+  } else {
+    mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
+        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
+        N);
    getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
  }
 }

-extern "C" void bitonicSortShared(uint *d_DstKey,
-                                  uint *d_DstVal,
-                                  uint *d_SrcKey,
-                                  uint *d_SrcVal,
-                                  uint  batchSize,
-                                  uint  arrayLength,
+extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
+                                  uint *d_SrcKey, uint *d_SrcVal,
+                                  uint batchSize, uint arrayLength,
                                  uint sortDir);

-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
-                                                uint *d_DstVal,
-                                                uint *d_SrcKey,
-                                                uint *d_SrcVal,
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
+                                                uint *d_SrcKey, uint *d_SrcVal,
                                                uint *d_LimitsA,
-                                                uint *d_LimitsB,
-                                                uint  stride,
-                                                uint  N,
-                                                uint  sortDir);
+                                                uint *d_LimitsB, uint stride,
+                                                uint N, uint sortDir);

 static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
 static const uint MAX_SAMPLE_COUNT = 32768;

-extern "C" void initMergeSort(void)
-{
-    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
-    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
-    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
-    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
+extern "C" void initMergeSort(void) {
+  checkCudaErrors(
+      cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
+  checkCudaErrors(
+      cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
+  checkCudaErrors(
+      cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
+  checkCudaErrors(
+      cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
 }

-extern "C" void closeMergeSort(void)
-{
+extern "C" void closeMergeSort(void) {
  checkCudaErrors(cudaFree(d_RanksA));
  checkCudaErrors(cudaFree(d_RanksB));
  checkCudaErrors(cudaFree(d_LimitsB));
  checkCudaErrors(cudaFree(d_LimitsA));
 }

-extern "C" void mergeSort(uint *d_DstKey,
-                          uint *d_DstVal,
-                          uint *d_BufKey,
-                          uint *d_BufVal,
-                          uint *d_SrcKey,
-                          uint *d_SrcVal,
-                          uint  N,
-                          uint  sortDir)
-{
+extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
+                          uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal,
+                          uint N, uint sortDir) {
  uint stageCount = 0;

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
@ -489,8 +482,7 @@ extern "C" void mergeSort(uint *d_DstKey,
    ival = d_BufVal;
    okey = d_DstKey;
    oval = d_DstVal;
-    }
-    else {
+  } else {
    ikey = d_DstKey;
    ival = d_DstVal;
    okey = d_BufKey;
@ -499,7 +491,8 @@ extern "C" void mergeSort(uint *d_DstKey,

  assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
  assert(N % SHARED_SIZE_LIMIT == 0);
-    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
+  mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT,
+                  SHARED_SIZE_LIMIT, sortDir);

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
    uint lastSegmentElements = N % (2 * stride);
@ -511,19 +504,18 @@ extern "C" void mergeSort(uint *d_DstKey,
    mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

    // Merge elementary intervals
-        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
+    mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB,
+                             stride, N, sortDir);

    if (lastSegmentElements <= stride) {
      // Last merge segment consists of a single array which just needs to be
      // passed through
-            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
-                                       ikey + (N - lastSegmentElements),
-                                       lastSegmentElements * sizeof(uint),
-                                       cudaMemcpyDeviceToDevice));
-            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
-                                       ival + (N - lastSegmentElements),
-                                       lastSegmentElements * sizeof(uint),
-                                       cudaMemcpyDeviceToDevice));
+      checkCudaErrors(cudaMemcpy(
+          okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
+          lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
+      checkCudaErrors(cudaMemcpy(
+          oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
+          lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
    }

    uint *t;
--- a/Samples/0_Introduction/mergeSort/mergeSort_common.h
+++ b/Samples/0_Introduction/mergeSort/mergeSort_common.h
@ -36,12 +36,14 @@ typedef unsigned int uint;
 ////////////////////////////////////////////////////////////////////////////////
 // Extensive sort validation routine
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" uint
-validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);
+extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
+                                   uint arrayLength, uint numValues,
+                                   uint sortDir);

 extern "C" void fillValues(uint *val, uint N);

-extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);
+extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
+                                    uint batchSize, uint arrayLength);

 ////////////////////////////////////////////////////////////////////////////////
 // CUDA merge sort
@ -50,11 +52,13 @@ extern "C" void initMergeSort(void);

 extern "C" void closeMergeSort(void);

-extern "C" void
-mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
+extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
+                          uint *bufVal, uint *srcKey, uint *srcVal, uint N,
+                          uint sortDir);

 ////////////////////////////////////////////////////////////////////////////////
 // CPU "emulation"
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void
-mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
+extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
+                              uint *bufVal, uint *srcKey, uint *srcVal, uint N,
+                              uint sortDir);
--- a/Samples/0_Introduction/mergeSort/mergeSort_host.cpp
+++ b/Samples/0_Introduction/mergeSort/mergeSort_host.cpp
@ -29,20 +29,19 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include "mergeSort_common.h"

 ////////////////////////////////////////////////////////////////////////////////
 // Helper functions
 ////////////////////////////////////////////////////////////////////////////////
-static void checkOrder(uint *data, uint N, uint sortDir)
-{
+static void checkOrder(uint *data, uint N, uint sortDir) {
  if (N <= 1) {
    return;
  }

  for (uint i = 0; i < N - 1; i++)
-        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
+    if ((sortDir && (data[i] > data[i + 1])) ||
+        (!sortDir && (data[i] < data[i + 1]))) {
      fprintf(stderr, "checkOrder() failed!!!\n");
      exit(EXIT_FAILURE);
    }
@ -50,13 +49,12 @@ static void checkOrder(uint *data, uint N, uint sortDir)

 static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

-static uint getSampleCount(uint dividend)
-{
-    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
+static uint getSampleCount(uint dividend) {
+  return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1)
+                                           : (dividend / SAMPLE_STRIDE);
 }

-static uint nextPowerOfTwo(uint x)
-{
+static uint nextPowerOfTwo(uint x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
@ -66,8 +64,7 @@ static uint nextPowerOfTwo(uint x)
  return ++x;
 }

-static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
-{
+static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
  if (L == 0) {
    return 0;
  }
@ -77,7 +74,8 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
  for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

-        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
+    if ((sortDir && (data[newPos - 1] <= val)) ||
+        (!sortDir && (data[newPos - 1] >= val))) {
      pos = newPos;
    }
  }
@ -85,8 +83,7 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
  return pos;
 }

-static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
-{
+static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
  if (L == 0) {
    return 0;
  }
@ -96,7 +93,8 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
  for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

-        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
+    if ((sortDir && (data[newPos - 1] < val)) ||
+        (!sortDir && (data[newPos - 1] > val))) {
      pos = newPos;
    }
  }
@ -107,10 +105,12 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 1: find sample ranks in each segment
 ////////////////////////////////////////////////////////////////////////////////
-static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
-{
+static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
+                                uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
-    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
+  uint sampleCount =
+      (lastSegmentElements > stride)
+          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  for (uint pos = 0; pos < sampleCount; pos++) {
@ -124,14 +124,17 @@ static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint s

    if (i < nA) {
      ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
-            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
-                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
+      ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
+          binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
+                                srcKey + segmentBase + stride, lenB, sortDir);
    }

    if (i < nB) {
      ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
-            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
-                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
+      ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
+          binarySearchInclusive(
+              srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
+              srcKey + segmentBase, lenA, sortDir);
    }
  }
 }
@ -139,10 +142,12 @@ static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint s
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 2: merge ranks and indices to derive elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
-{
+static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
+                                 uint N) {
  uint lastSegmentElements = N % (2 * stride);
-    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
+  uint sampleCount =
+      (lastSegmentElements > stride)
+          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  for (uint pos = 0; pos < sampleCount; pos++) {
@ -156,20 +161,23 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)

    if (i < nA) {
      uint dstPosA =
-                binarySearchExclusive(
-                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
-                + i;
+          binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i],
+                                ranks + (segmentBase + stride) / SAMPLE_STRIDE,
+                                nB, 1) +
+          i;
      assert(dstPosA < nA + nB);
-            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
+      limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
+          ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
    }

    if (i < nB) {
-            uint dstPosA =
-                binarySearchInclusive(
-                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
-                + i;
+      uint dstPosA = binarySearchInclusive(
+                         ranks[(segmentBase + stride) / SAMPLE_STRIDE + i],
+                         ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) +
+                     i;
      assert(dstPosA < nA + nB);
-            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
+      limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
+          ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
    }
  }
 }
@ -177,16 +185,9 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
 ////////////////////////////////////////////////////////////////////////////////
-static void merge(uint *dstKey,
-                  uint *dstVal,
-                  uint *srcAKey,
-                  uint *srcAVal,
-                  uint *srcBKey,
-                  uint *srcBVal,
-                  uint  lenA,
-                  uint  lenB,
-                  uint  sortDir)
-{
+static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
+                  uint *srcBKey, uint *srcBVal, uint lenA, uint lenB,
+                  uint sortDir) {
  checkOrder(srcAKey, lenA, sortDir);
  checkOrder(srcBKey, lenB, sortDir);

@ -205,18 +206,13 @@ static void merge(uint *dstKey,
  }
 }

-static void mergeElementaryIntervals(uint *dstKey,
-                                     uint *dstVal,
-                                     uint *srcKey,
-                                     uint *srcVal,
-                                     uint *limitsA,
-                                     uint *limitsB,
-                                     uint  stride,
-                                     uint  N,
-                                     uint  sortDir)
-{
+static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
+                                     uint *srcVal, uint *limitsA, uint *limitsB,
+                                     uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
-    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
+  uint mergePairs = (lastSegmentElements > stride)
+                        ? getSampleCount(N)
+                        : (N - lastSegmentElements) / SAMPLE_STRIDE;

  for (uint pos = 0; pos < mergePairs; pos++) {
    uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
@ -244,18 +240,15 @@ static void mergeElementaryIntervals(uint *dstKey,
          (srcKey + segmentBase + 0) + startPosA,
          (srcVal + segmentBase + 0) + startPosA,
          (srcKey + segmentBase + stride) + startPosB,
-              (srcVal + segmentBase + stride) + startPosB,
-              endPosA - startPosA,
-              endPosB - startPosB,
-              sortDir);
+          (srcVal + segmentBase + stride) + startPosB, endPosA - startPosA,
+          endPosB - startPosB, sortDir);
  }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Naive bubble sort
 ////////////////////////////////////////////////////////////////////////////////
-static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
-{
+static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
  if (N <= 1) {
    return;
  }
@ -285,9 +278,9 @@ static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
 ////////////////////////////////////////////////////////////////////////////////
 // Interface function
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void
-mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
-{
+extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
+                              uint *bufVal, uint *srcKey, uint *srcVal, uint N,
+                              uint sortDir) {
  uint *ikey, *ival, *okey, *oval;
  uint stageCount = 0;

@ -299,8 +292,7 @@ mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcK
    ival = bufVal;
    okey = dstKey;
    oval = dstVal;
-    }
-    else {
+  } else {
    ikey = dstKey;
    ival = dstVal;
    okey = bufKey;
@ -312,7 +304,8 @@ mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcK
  memcpy(ival, srcVal, N * sizeof(uint));

  for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
-        bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
+    bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
+               sortDir);
  }

  printf("Merge...\n");
@ -336,15 +329,16 @@ mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcK
    mergeRanksAndIndices(limitsB, ranksB, stride, N);

    // Merge elementary intervals
-        mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
+    mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
+                             N, sortDir);

    if (lastSegmentElements <= stride) {
      // Last merge segment consists of a single array which just needs to be
      // passed through
-            memcpy(
-                okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
-            memcpy(
-                oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
+      memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
+             lastSegmentElements * sizeof(uint));
+      memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
+             lastSegmentElements * sizeof(uint));
    }

    uint *t;
--- a/Samples/0_Introduction/mergeSort/mergeSort_validate.cpp
+++ b/Samples/0_Introduction/mergeSort/mergeSort_validate.cpp
@ -29,15 +29,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include "mergeSort_common.h"

 ////////////////////////////////////////////////////////////////////////////////
 // Validate sorted keys array (check for integrity and proper order)
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" uint
-validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
-{
+extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
+                                   uint arrayLength, uint numValues,
+                                   uint sortDir) {
  uint *srcHist;
  uint *resHist;

@ -52,7 +51,8 @@ validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength,

  int flag = 1;

-    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
+  for (uint j = 0; j < batchSize;
+       j++, srcKey += arrayLength, resKey += arrayLength) {
    // Build histograms for keys arrays
    memset(srcHist, 0, numValues * sizeof(uint));
    memset(resHist, 0, numValues * sizeof(uint));
@ -61,9 +61,11 @@ validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength,
      if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
        srcHist[srcKey[i]]++;
        resHist[resKey[i]]++;
-            }
-            else {
-                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
+      } else {
+        fprintf(
+            stderr,
+            "***Set %u source/result key arrays are not limited properly***\n",
+            j);
        flag = 0;
        goto brk;
      }
@ -72,15 +74,18 @@ validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength,
    // Compare the histograms
    for (uint i = 0; i < numValues; i++)
      if (srcHist[i] != resHist[i]) {
-                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
+        fprintf(stderr,
+                "***Set %u source/result keys histograms do not match***\n", j);
        flag = 0;
        goto brk;
      }

    // Finally check the ordering
    for (uint i = 0; i < arrayLength - 1; i++)
-            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
-                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
+      if ((sortDir && (resKey[i] > resKey[i + 1])) ||
+          (!sortDir && (resKey[i] < resKey[i + 1]))) {
+        fprintf(stderr,
+                "***Set %u result key array is not ordered properly***\n", j);
        flag = 0;
        goto brk;
      }
@ -90,8 +95,7 @@ brk:
  free(resHist);
  free(srcHist);

-    if (flag)
-        printf("OK\n");
+  if (flag) printf("OK\n");

  return flag;
 }
@ -99,30 +103,30 @@ brk:
 ////////////////////////////////////////////////////////////////////////////////
 // Value validation / stability check routines
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void fillValues(uint *val, uint N)
-{
-    for (uint i = 0; i < N; i++)
-        val[i] = i;
+extern "C" void fillValues(uint *val, uint N) {
+  for (uint i = 0; i < N; i++) val[i] = i;
 }

-extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
-{
+extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
+                                    uint batchSize, uint arrayLength) {
  int correctFlag = 1, stableFlag = 1;

  printf("...inspecting keys and values array: ");

-    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
+  for (uint i = 0; i < batchSize;
+       i++, resKey += arrayLength, resVal += arrayLength) {
    for (uint j = 0; j < arrayLength; j++) {
-            if (resKey[j] != srcKey[resVal[j]])
-                correctFlag = 0;
+      if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0;

-            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
+      if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) &&
+          (resVal[j] > resVal[j + 1]))
        stableFlag = 0;
    }
  }

  printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
-    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");
+  printf(stableFlag ? "...stability property: stable!\n"
+                    : "...stability property: NOT stable\n");

  return correctFlag;
 }
--- a/Samples/0_Introduction/simpleAWBarrier/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleAWBarrier/CMakeLists.txt
@ -11,10 +11,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CUDA_ARCHITECTURES 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")

-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleAWBarrier/README.md
+++ b/Samples/0_Introduction/simpleAWBarrier/README.md
@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaDeviceGetAttribute, cudaMallocHost, cud

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in the Dependencies section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleAWBarrier/simpleAWBarrier.cu
+++ b/Samples/0_Introduction/simpleAWBarrier/simpleAWBarrier.cu
@ -29,9 +29,9 @@
 #include <stdio.h>

 // Includes CUDA
-#include <cooperative_groups.h>
-#include <cuda/barrier>
 #include <cuda_runtime.h>
+#include <cuda/barrier>
+#include <cooperative_groups.h>

 // Utilities and timing functions
 #include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
@ -43,11 +43,9 @@ namespace cg = cooperative_groups;

 #if __CUDA_ARCH__ >= 700
 template <bool writeSquareRoot>
-__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
-                                cg::thread_block_tile<32>               &tile32,
-                                double                                  &threadSum,
-                                double                                  *result)
-{
+__device__ void reduceBlockData(
+    cuda::barrier<cuda::thread_scope_block> &barrier,
+    cg::thread_block_tile<32> &tile32, double &threadSum, double *result) {
  extern __shared__ double tmp[];

 #pragma unroll
@ -64,7 +62,9 @@ __device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier

  // The warp 0 will perform last round of reduction
  if (tile32.meta_group_rank() == 0) {
-        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
+    double beta = tile32.thread_rank() < tile32.meta_group_size()
+                      ? tmp[tile32.thread_rank()]
+                      : 0.0;

 #pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
@ -81,8 +81,8 @@ __device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier
 }
 #endif

-__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
-{
+__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
+                                             double *partialResults, int size) {
 #if __CUDA_ARCH__ >= 700
 #pragma diag_suppress static_var_with_dynamic_init
  cg::thread_block cta = cg::this_thread_block();
@ -105,7 +105,8 @@ __global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *p

  // Each thread block performs reduction of partial dotProducts and writes to
  // global mem.
-    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
+  reduceBlockData<false>(barrier, tile32, threadSum,
+                         &partialResults[blockIdx.x]);

  cg::sync(grid);

@ -136,15 +137,15 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("%s starting...\n", argv[0]);

  // This will pick the best possible CUDA capable device
  int dev = findCudaDevice(argc, (const char **)argv);

  int major = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

  // Arrive-Wait Barriers require a GPU of Volta (SM7X) architecture or higher.
  if (major < 7) {
@ -153,10 +154,12 @@ int main(int argc, char **argv)
  }

  int supportsCooperativeLaunch = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
+                                         cudaDevAttrCooperativeLaunch, dev));

  if (!supportsCooperativeLaunch) {
-        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
+    printf(
+        "\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
        "Waiving the run\n",
        dev);
    exit(EXIT_WAIVED);
@ -168,8 +171,7 @@ int main(int argc, char **argv)
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
-{
+int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
  float *vecA, *d_vecA;
  float *vecB, *d_vecB;
  double *d_partialResults;
@ -189,14 +191,16 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

-    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
+                                  cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
+                                  cudaMemcpyHostToDevice, stream));

  // Kernel configuration, where a one-dimensional
  // grid and one-dimensional blocks are configured.
  int minGridSize = 0, blockSize = 0;
-    checkCudaErrors(
-        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
+  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
+      &minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

  int smemSize = ((blockSize / 32) + 1) * sizeof(double);

@ -205,24 +209,28 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
      &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

  int multiProcessorCount = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
+  checkCudaErrors(cudaDeviceGetAttribute(
+      &multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

  minGridSize = multiProcessorCount * numBlocksPerSm;
  checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

-    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
+  printf(
+      "Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
      "blockSize = %d\n",
-           minGridSize,
-           blockSize);
+      minGridSize, blockSize);

  dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

-    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
+  void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
+                        (void *)&d_partialResults, (void *)&size};

-    checkCudaErrors(cudaLaunchCooperativeKernel(
-        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
+  checkCudaErrors(
+      cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
+                                  dimBlock, kernelArgs, smemSize, stream));

-    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
+                                  cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
@ -231,8 +239,7 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
    if ((vecA[i] - expectedResult) > 0.00001) {
      printf("mismatch at i = %d\n", i);
      break;
-        }
-        else {
+    } else {
      matches++;
    }
  }
--- a/Samples/0_Introduction/simpleAssert/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleAssert/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Remove -DNDEBUG so that this sample prints its specific logs.
--- a/Samples/0_Introduction/simpleAssert/README.md
+++ b/Samples/0_Introduction/simpleAssert/README.md
@ -27,6 +27,6 @@ cudaDeviceSynchronize, cudaGetErrorString

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleAssert/simpleAssert.cu
+++ b/Samples/0_Introduction/simpleAssert/simpleAssert.cu
@ -34,8 +34,8 @@
 #endif

 // Includes, system
-#include <cassert>
 #include <stdio.h>
+#include <cassert>

 // Includes CUDA
 #include <cuda_runtime.h>
@ -58,8 +58,7 @@ bool testResult = true;
 //! Tests assert function.
 //! Threads whose global id is >= N will fail the assert and print an assertion-failed error message.
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void testKernel(int N)
-{
+__global__ void testKernel(int N) {
  int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  assert(gtid < N);
 }
@ -71,18 +70,17 @@ void runTest(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("%s starting...\n", sampleName);

  runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
+  printf("%s completed, returned %s\n", sampleName,
+         testResult ? "OK" : "ERROR!");
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-void runTest(int argc, char **argv)
-{
+void runTest(int argc, char **argv) {
  int Nblocks = 2;
  int Nthreads = 32;
  cudaError_t error;
@ -96,8 +94,7 @@ void runTest(int argc, char **argv)
  if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
    printf("simpleAssert is not current supported on Mac OSX\n\n");
    exit(EXIT_SUCCESS);
-    }
-    else {
+  } else {
    printf("OS Info: <%s>\n\n", OS_System_Type.version);
  }

@ -121,7 +118,8 @@ void runTest(int argc, char **argv)

  // Check for errors and failed asserts in asynchronous kernel launch.
  if (error == cudaErrorAssert) {
-        printf("Device assert failed as expected, "
+    printf(
+        "Device assert failed as expected, "
        "CUDA error message is: %s\n\n",
        cudaGetErrorString(error));
  }
--- a/Samples/0_Introduction/simpleAssert_nvrtc/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleAssert_nvrtc/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # left disabled: -G enables cuda-gdb but is expensive; while commented out, this if() is a no-op
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleAssert_nvrtc/README.md
+++ b/Samples/0_Introduction/simpleAssert_nvrtc/README.md
@ -30,7 +30,7 @@ cuModuleGetFunction, cuLaunchKernel, cuCtxSynchronize

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert.cpp
+++ b/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert.cpp
@ -34,12 +34,11 @@
 #endif

 // Includes, system
-#include <cassert>
 #include <stdio.h>
+#include <cassert>

 // Includes CUDA
 #include <cuda_runtime.h>
-
 #include "nvrtc_helper.h"

 // Utilities and timing functions
@ -59,8 +58,7 @@ void runTest(int argc, char **argv);
 // Program main
 ////////////////////////////////////////////////////////////////////////////////

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("%s starting...\n", sampleName);

  runTest(argc, argv);
@ -68,8 +66,7 @@ int main(int argc, char **argv)
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-void runTest(int argc, char **argv)
-{
+void runTest(int argc, char **argv) {
  int Nblocks = 2;
  int Nthreads = 32;

@ -94,15 +91,10 @@ void runTest(int argc, char **argv)
  int count = 60;
  void *args[] = {(void *)&count};

-    checkCudaErrors(cuLaunchKernel(kernel_addr,
-                                   dimGrid.x,
-                                   dimGrid.y,
-                                   dimGrid.z, /* grid dim */
-                                   dimBlock.x,
-                                   dimBlock.y,
-                                   dimBlock.z, /* block dim */
-                                   0,
-                                   0,        /* shared mem, stream */
+  checkCudaErrors(cuLaunchKernel(
+      kernel_addr, dimGrid.x, dimGrid.y, dimGrid.z, /* grid dim */
+      dimBlock.x, dimBlock.y, dimBlock.z,           /* block dim */
+      0, 0,                                         /* shared mem, stream */
      &args[0],                                     /* arguments */
      0));

--- a/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert_kernel.cu
+++ b/Samples/0_Introduction/simpleAssert_nvrtc/simpleAssert_kernel.cu
@ -32,8 +32,7 @@
 //! Threads whose id >= N fail the assertion and print an assertion-failed error message.
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" __global__ void testKernel(int N)
-{
+extern "C" __global__ void testKernel(int N) {
  int gtid = blockIdx.x * blockDim.x + threadIdx.x;
  assert(gtid < N);
 }
--- a/Samples/0_Introduction/simpleAtomicIntrinsics/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics/CMakeLists.txt
@ -11,10 +11,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CUDA_ARCHITECTURES  50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")

-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # left disabled: -G enables cuda-gdb but is expensive; while commented out, this if() is a no-op
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleAtomicIntrinsics/README.md
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics/README.md
@ -27,6 +27,6 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics.cu
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics.cu
@ -30,10 +30,10 @@
 */

 // includes, system
-#include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
+#include <math.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@ -68,21 +68,20 @@ extern "C" bool computeGold(int *gpuData, const int len);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("%s starting...\n", sampleName);

  runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
+  printf("%s completed, returned %s\n", sampleName,
+         testResult ? "OK" : "ERROR!");
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv)
-{
+void runTest(int argc, char **argv) {
  cudaStream_t stream;
  // This will pick the best possible CUDA capable device
  findCudaDevice(argc, (const char **)argv);
@ -101,8 +100,7 @@ void runTest(int argc, char **argv)
  checkCudaErrors(cudaMallocHost(&hOData, memSize));

  // initialize the memory
-    for (unsigned int i = 0; i < numData; i++)
-        hOData[i] = 0;
+  for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;

  // To make the AND and XOR tests generate something other than 0...
  hOData[8] = hOData[10] = 0xff;
@ -112,13 +110,15 @@ void runTest(int argc, char **argv)
  int *dOData;
  checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
  // copy host memory to device to initialize to zero
-    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

  // execute the kernel
  testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

  // Copy result from device to host
-    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  sdkStopTimer(&timer);
--- a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_cpu.cpp
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_cpu.cpp
@ -42,8 +42,7 @@ extern "C" int computeGold(int *gpuData, const int len);
 //! @param idata      input data as provided to device
 //! @param len        number of elements in reference / idata
 ////////////////////////////////////////////////////////////////////////////////
-int computeGold(int *gpuData, const int len)
-{
+int computeGold(int *gpuData, const int len) {
  int val = 0;

  for (int i = 0; i < len; ++i) {
--- a/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_kernel.cuh
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics/simpleAtomicIntrinsics_kernel.cuh
@ -35,8 +35,7 @@
 //! @param g_idata  input data in global memory
 //! @param g_odata  output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void testKernel(int *g_odata)
-{
+__global__ void testKernel(int *g_odata) {
  // access thread id
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

--- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # left disabled: -G enables cuda-gdb but is expensive; while commented out, this if() is a no-op
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/README.md
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/README.md
@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics.cpp
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics.cpp
@ -30,10 +30,10 @@
 */

 // includes, system
-#include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
+#include <math.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@ -64,13 +64,13 @@ extern "C" bool computeGold(int *gpuData, const int len);
 // Program main
 ////////////////////////////////////////////////////////////////////////////////

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("%s starting...\n", sampleName);

  runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
+  printf("%s completed, returned %s\n", sampleName,
+         testResult ? "OK" : "ERROR!");

  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
@ -79,8 +79,7 @@ int main(int argc, char **argv)
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////

-void runTest(int argc, char **argv)
-{
+void runTest(int argc, char **argv) {
  int dev = 0;

  char *cubin, *kernel_file;
@ -107,8 +106,7 @@ void runTest(int argc, char **argv)
  int *hOData = (int *)malloc(memSize);

  // initialize the memory
-    for (unsigned int i = 0; i < numData; i++)
-        hOData[i] = 0;
+  for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;

  // To make the AND and XOR tests generate something other than 0...
  hOData[8] = hOData[10] = 0xff;
@ -123,15 +121,11 @@ void runTest(int argc, char **argv)
  dim3 cudaGridSize(numBlocks, 1, 1);

  void *arr[] = {(void *)&dOData};
-    checkCudaErrors(cuLaunchKernel(kernel_addr,
-                                   cudaGridSize.x,
-                                   cudaGridSize.y,
+  checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
                                 cudaGridSize.z, /* grid dim */
-                                   cudaBlockSize.x,
-                                   cudaBlockSize.y,
+                                 cudaBlockSize.x, cudaBlockSize.y,
                                 cudaBlockSize.z, /* block dim */
-                                   0,
-                                   0,       /* shared mem, stream */
+                                 0, 0,            /* shared mem, stream */
                                 &arr[0],         /* arguments */
                                 0));

--- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_cpu.cpp
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_cpu.cpp
@ -43,8 +43,7 @@ extern "C" int computeGold(int *gpuData, const int len);
 //! @param len        number of elements in reference / idata
 ////////////////////////////////////////////////////////////////////////////////

-int computeGold(int *gpuData, const int len)
-{
+int computeGold(int *gpuData, const int len) {
  int val = 0;

  for (int i = 0; i < len; ++i) {
--- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_kernel.cuh
+++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/simpleAtomicIntrinsics_kernel.cuh
@ -36,8 +36,7 @@
 //! @param g_odata  output data in global memory
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" __global__ void testKernel(int *g_odata)
-{
+extern "C" __global__ void testKernel(int *g_odata) {
  // access thread id
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

--- a/Samples/0_Introduction/simpleAttributes/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleAttributes/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # left disabled: -G enables cuda-gdb but is expensive; while commented out, this if() is a no-op
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleAttributes/README.md
+++ b/Samples/0_Introduction/simpleAttributes/README.md
@ -27,6 +27,6 @@ cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSynchronize, cudaStreamSetAttr

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu
+++ b/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu
@ -26,10 +26,10 @@
 */

 // includes, system
-#include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
+#include <math.h>

 // includes CUDA
 #include <cuda_runtime.h>
@ -42,8 +42,7 @@
 // declaration, forward
 void runTest(int argc, char **argv);

-cudaAccessPolicyWindow initAccessPolicyWindow(void)
-{
+cudaAccessPolicyWindow initAccessPolicyWindow(void) {
  cudaAccessPolicyWindow accessPolicyWindow = {0};
  accessPolicyWindow.base_ptr = (void *)0;
  accessPolicyWindow.num_bytes = 0;
@ -61,8 +60,8 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void)
 //! @param bigDataSize  input bigData size
 //! @param hitCount how many data accesses are done within a block
 ////////////////////////////////////////////////////////////////////////////////
-static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
-{
+static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
+                                            int bigDataSize, int hitCount) {
  __shared__ unsigned int hit;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
@ -83,9 +82,9 @@ static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,

    if ((tID % 2) == 0) {
      data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
-        }
-        else {
-            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
+    } else {
+      trash[psRand % bigDataSize] =
+          trash[psRand % bigDataSize] + trash[idx % bigDataSize];
    }

    atomicAdd(&hit, 1);
@ -99,8 +98,7 @@ int main(int argc, char **argv) { runTest(argc, argv); }
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv)
-{
+void runTest(int argc, char **argv) {
  bool bTestResult = true;
  cudaAccessPolicyWindow accessPolicyWindow;
  cudaDeviceProp deviceProp;
@ -129,7 +127,8 @@ void runTest(int argc, char **argv)

  // Make sure the device supports the persisting L2 cache optimization
  if (deviceProp.persistingL2CacheMaxSize == 0) {
-        printf("Waiving execution as device %d does not support persisting L2 "
+    printf(
+        "Waiving execution as device %d does not support persisting L2 "
        "Caching\n",
        devID);
    exit(EXIT_WAIVED);
@ -140,7 +139,8 @@ void runTest(int argc, char **argv)

  // Set the amount of l2 cache that will be persisting to maximum the device
  // can support
-    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));
+  checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
+                                     deviceProp.persistingL2CacheMaxSize));

  // Stream attribute to set
  streamAttrID = cudaStreamAttributeAccessPolicyWindow;
@ -155,7 +155,8 @@ void runTest(int argc, char **argv)

  // Allocate data
  checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
-    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));
+  checkCudaErrors(
+      cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

  for (int i = 0; i < bigDataSize; ++i) {
    if (i < dataSize) {
@ -165,12 +166,16 @@ void runTest(int argc, char **argv)
    bigDataHostPointer[bigDataSize - i - 1] = i;
  }

-    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
-    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
  checkCudaErrors(
-        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(
-        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
+      cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
+  checkCudaErrors(
+      cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
+  checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
+                                  dataSize * sizeof(int),
+                                  cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
+                                  bigDataSize * sizeof(int),
+                                  cudaMemcpyHostToDevice, stream));

  // Make a window for the buffer of interest
  accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
@ -181,7 +186,8 @@ void runTest(int argc, char **argv)
  streamAttrValue.accessPolicyWindow = accessPolicyWindow;

  // Assign window to stream
-    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
+  checkCudaErrors(
+      cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

  // Demote any previous persisting lines
  checkCudaErrors(cudaCtxResetPersistingL2Cache());
--- a/Samples/0_Introduction/simpleCUDA2GL/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleCUDA2GL/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # left disabled: -G enables cuda-gdb but is expensive; while commented out, this if() is a no-op
 endif()

 # Include directories and libraries
@ -61,16 +59,12 @@ if(${OpenGL_FOUND})

                add_custom_command(TARGET simpleCUDA2GL
                    POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E
-                    copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
-                    ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
+                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIG>/freeglut.dll $<TARGET_FILE_DIR:simpleCUDA2GL>  # place freeglut.dll beside the built executable (correct for single- and multi-config generators)
                )

                add_custom_command(TARGET simpleCUDA2GL
                    POST_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E
-                    copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
-                    ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
+                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIG>/glew64.dll $<TARGET_FILE_DIR:simpleCUDA2GL>  # place glew64.dll beside the built executable (correct for single- and multi-config generators)
                )
            endif()
        endif()
--- a/Samples/0_Introduction/simpleCUDA2GL/README.md
+++ b/Samples/0_Introduction/simpleCUDA2GL/README.md
@ -30,7 +30,8 @@ cudaHostAlloc, cudaGraphicsUnmapResources, cudaMalloc, cudaFree, cudaGraphicsRes

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

 ## References (for more details)
+
--- a/Samples/0_Introduction/simpleCUDA2GL/main.cpp
+++ b/Samples/0_Introduction/simpleCUDA2GL/main.cpp
@ -50,8 +50,8 @@
 #endif

 // CUDA includes
-#include <cuda_gl_interop.h>
 #include <cuda_runtime.h>
+#include <cuda_gl_interop.h>

 // CUDA utilities and system includes
 #include <helper_cuda.h>
@ -124,7 +124,8 @@ StopWatchInterface *timer    = NULL;
 GLuint shDraw;

 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw);
+extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
+                                   unsigned int *g_odata, int imgw);

 // Forward declarations
 void runStdProgram(int argc, char **argv);
@ -139,7 +140,8 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource);
 void deletePBO(GLuint *pbo);
 #endif

-void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y);
+void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
+                      unsigned int size_y);
 void deleteTexture(GLuint *tex);

 // rendering callbacks
@ -153,8 +155,7 @@ void mainMenu(int i);
 ////////////////////////////////////////////////////////////////////////////////
 //! Create PBO
 ////////////////////////////////////////////////////////////////////////////////
-void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource)
-{
+void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) {
  // set up vertex data parameter
  num_texels = image_width * image_height;
  num_values = num_texels * 4;
@ -170,32 +171,33 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource)
  glBindBuffer(GL_ARRAY_BUFFER, 0);

  // register this buffer object with CUDA
-    checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, cudaGraphicsMapFlagsNone));
+  checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo,
+                                               cudaGraphicsMapFlagsNone));

  SDK_CHECK_ERROR_GL();
 }

-void deletePBO(GLuint *pbo)
-{
+void deletePBO(GLuint *pbo) {
  glDeleteBuffers(1, pbo);
  SDK_CHECK_ERROR_GL();
  *pbo = 0;
 }
 #endif

-const GLenum fbo_targets[] = {GL_COLOR_ATTACHMENT0_EXT,
-                              GL_COLOR_ATTACHMENT1_EXT,
-                              GL_COLOR_ATTACHMENT2_EXT,
-                              GL_COLOR_ATTACHMENT3_EXT};
+const GLenum fbo_targets[] = {
+    GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT,
+    GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT};

 #ifndef USE_TEXSUBIMAGE2D
-static const char *glsl_drawtex_vertshader_src = "void main(void)\n"
+static const char *glsl_drawtex_vertshader_src =
+    "void main(void)\n"
    "{\n"
    "	gl_Position = gl_Vertex;\n"
    "	gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n"
    "}\n";

-static const char *glsl_drawtex_fragshader_src = "#version 130\n"
+static const char *glsl_drawtex_fragshader_src =
+    "#version 130\n"
    "uniform usampler2D texImage;\n"
    "void main()\n"
    "{\n"
@ -225,15 +227,15 @@ static const char *glsl_draw_fragshader_src =
 #endif

 // copy image and process using CUDA
-void generateCUDAImage()
-{
+void generateCUDAImage() {
  // run the Cuda kernel
  unsigned int *out_data;

 #ifdef USE_TEXSUBIMAGE2D
  checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0));
  size_t num_bytes;
-    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
+  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
+      (void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
 // printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n",
 // num_bytes, size_tex_data);
 #else
@ -256,7 +258,8 @@ void generateCUDAImage()
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest);

  glBindTexture(GL_TEXTURE_2D, tex_cudaResult);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA,
+                  GL_UNSIGNED_BYTE, NULL);
  SDK_CHECK_ERROR_GL();
  glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
@ -265,20 +268,21 @@ void generateCUDAImage()
  // map buffer objects to get CUDA device pointers
  cudaArray *texture_ptr;
  checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0));
-    checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&texture_ptr, cuda_tex_result_resource, 0, 0));
+  checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(
+      &texture_ptr, cuda_tex_result_resource, 0, 0));

  int num_texels = image_width * image_height;
  int num_values = num_texels * 4;
  int size_tex_data = sizeof(GLubyte) * num_values;
-    checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice));
+  checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource,
+                                    size_tex_data, cudaMemcpyDeviceToDevice));

  checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0));
 #endif
 }

 // display image to the screen as textured quad
-void displayImage(GLuint texture)
-{
+void displayImage(GLuint texture) {
  glBindTexture(GL_TEXTURE_2D, texture);
  glEnable(GL_TEXTURE_2D);
  glDisable(GL_DEPTH_TEST);
@ -328,8 +332,7 @@ void displayImage(GLuint texture)
 ////////////////////////////////////////////////////////////////////////////////
 //! Display callback
 ////////////////////////////////////////////////////////////////////////////////
-void display()
-{
+void display() {
  sdkStartTimer(&timer);

  if (enable_cuda) {
@ -355,7 +358,9 @@ void display()
      sprintf(currentOutputPPM, "kilt.ppm");
      g_CheckRender->savePPM(currentOutputPPM, true, NULL);

-            if (!g_CheckRender->PPMvsPPM(currentOutputPPM, sdkFindFilePath(ref_file, pArgv[0]), MAX_EPSILON, 0.30f)) {
+      if (!g_CheckRender->PPMvsPPM(currentOutputPPM,
+                                   sdkFindFilePath(ref_file, pArgv[0]),
+                                   MAX_EPSILON, 0.30f)) {
        g_TotalErrors++;
      }

@ -369,7 +374,8 @@ void display()
  if (++fpsCount == fpsLimit) {
    char cTitle[256];
    float fps = 1000.0f / sdkGetAverageTimerValue(&timer);
-        sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, window_height, fps);
+    sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width,
+            window_height, fps);
    glutSetWindowTitle(cTitle);
    // printf("%s\n", cTitle);
    fpsCount = 0;
@ -378,8 +384,7 @@ void display()
  }
 }

-void timerEvent(int value)
-{
+void timerEvent(int value) {
  glutPostRedisplay();
  glutTimerFunc(REFRESH_DELAY, timerEvent, 0);
 }
@ -387,8 +392,7 @@ void timerEvent(int value)
 ////////////////////////////////////////////////////////////////////////////////
 //! Keyboard events handler
 ////////////////////////////////////////////////////////////////////////////////
-void keyboard(unsigned char key, int /*x*/, int /*y*/)
-{
+void keyboard(unsigned char key, int /*x*/, int /*y*/) {
  switch (key) {
    case (27):
      Cleanup(EXIT_SUCCESS);
@ -400,8 +404,7 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/)

      if (enable_cuda) {
        glClearColorIuiEXT(128, 128, 128, 255);
-        }
-        else {
+      } else {
        glClearColor(0.5, 0.5, 0.5, 1.0);
      }

@ -410,8 +413,7 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/)
  }
 }

-void reshape(int w, int h)
-{
+void reshape(int w, int h) {
  window_width = w;
  window_height = h;
 }
@ -421,8 +423,8 @@ void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); }
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y)
-{
+void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
+                      unsigned int size_y) {
  // create a texture
  glGenTextures(1, tex_cudaResult);
  glBindTexture(GL_TEXTURE_2D, *tex_cudaResult);
@ -434,22 +436,24 @@ void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

 #ifdef USE_TEXSUBIMAGE2D
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA,
+               GL_UNSIGNED_BYTE, NULL);
  SDK_CHECK_ERROR_GL();
 #else
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0,
+               GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
  SDK_CHECK_ERROR_GL();
  // register this texture with CUDA
  checkCudaErrors(cudaGraphicsGLRegisterImage(
-        &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard));
+      &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D,
+      cudaGraphicsMapFlagsWriteDiscard));
 #endif
 }

 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void deleteTexture(GLuint *tex)
-{
+void deleteTexture(GLuint *tex) {
  glDeleteTextures(1, tex);
  SDK_CHECK_ERROR_GL();

@ -459,8 +463,7 @@ void deleteTexture(GLuint *tex)
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
 #if defined(__linux__)
  char *Xstatus = getenv("DISPLAY");
  if (Xstatus == NULL) {
@ -484,7 +487,8 @@ int main(int argc, char **argv)
  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
    printf("[%s]\n", argv[0]);
    printf("   Does not explicitly support -device=n\n");
-        printf("   This sample requires OpenGL.  Only -file=<reference> are "
+    printf(
+        "   This sample requires OpenGL.  Only -file=<reference> are "
        "supported\n");
    printf("exiting...\n");
    exit(EXIT_WAIVED);
@ -493,8 +497,7 @@ int main(int argc, char **argv)
  if (ref_file) {
    printf("(Test with OpenGL verification)\n");
    runStdProgram(argc, argv);
-    }
-    else {
+  } else {
    printf("(Interactive OpenGL Demo)\n");
    runStdProgram(argc, argv);
  }
@ -505,8 +508,7 @@ int main(int argc, char **argv)
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void FreeResource()
-{
+void FreeResource() {
  sdkDeleteTimer(&timer);

 // unregister this buffer object with CUDA
@ -528,18 +530,18 @@ void FreeResource()
  printf("simpleCUDA2GL Exiting...\n");
 }

-void Cleanup(int iExitCode)
-{
+void Cleanup(int iExitCode) {
  FreeResource();
-    printf("PPM Images are %s\n", (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
+  printf("PPM Images are %s\n",
+         (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
  exit(iExitCode);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_shader_src)
-{
+GLuint compileGLSLprogram(const char *vertex_shader_src,
+                          const char *fragment_shader_src) {
  GLuint v, f, p = 0;

  p = glCreateProgram();
@ -561,8 +563,7 @@ GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_sh
      //#endif
      glDeleteShader(v);
      return 0;
-        }
-        else {
+    } else {
      glAttachShader(p, v);
    }
  }
@ -584,8 +585,7 @@ GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_sh
      //#endif
      glDeleteShader(f);
      return 0;
-        }
-        else {
+    } else {
      glAttachShader(p, f);
    }
  }
@ -611,8 +611,7 @@ GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_sh
 //! Allocate the "render target" of CUDA
 ////////////////////////////////////////////////////////////////////////////////
 #ifndef USE_TEXSUBIMAGE2D
-void initCUDABuffers()
-{
+void initCUDABuffers() {
  // set up vertex data parameter
  num_texels = image_width * image_height;
  num_values = num_texels * 4;
@ -626,8 +625,7 @@ void initCUDABuffers()
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void initGLBuffers()
-{
+void initGLBuffers() {
 // create pbo
 #ifdef USE_TEXSUBIMAGE2D
  createPBO(&pbo_dest, &cuda_pbo_dest_resource);
@ -638,7 +636,8 @@ void initGLBuffers()
  shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src);

 #ifndef USE_TEXSUBIMAGE2D
-    shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src);
+  shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src,
+                                 glsl_drawtex_fragshader_src);
 #endif
  SDK_CHECK_ERROR_GL();
 }
@ -646,8 +645,7 @@ void initGLBuffers()
 ////////////////////////////////////////////////////////////////////////////////
 //! Run standard demo loop with or without GL verification
 ////////////////////////////////////////////////////////////////////////////////
-void runStdProgram(int argc, char **argv)
-{
+void runStdProgram(int argc, char **argv) {
  // First initialize OpenGL context, so we can properly set the GL for CUDA.
  // This is necessary in order to achieve optimal performance with OpenGL/CUDA
  // interop.
@ -685,7 +683,8 @@ void runStdProgram(int argc, char **argv)
    g_CheckRender->EnableQAReadback(true);
  }

-    printf("\n"
+  printf(
+      "\n"
      "\tControls\n"
      "\t(right click mouse button for Menu)\n"
      "\t[esc] - Quit\n\n");
@ -700,8 +699,7 @@ void runStdProgram(int argc, char **argv)
 ////////////////////////////////////////////////////////////////////////////////
 //! Initialize GL
 ////////////////////////////////////////////////////////////////////////////////
-bool initGL(int *argc, char **argv)
-{
+bool initGL(int *argc, char **argv) {
  // Create GL context
  glutInit(argc, argv);
  glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH);
@ -709,8 +707,8 @@ bool initGL(int *argc, char **argv)
  iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing");

  // initialize necessary OpenGL extensions
-    if (!isGLVersionSupported(2, 0)
-        || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
+  if (!isGLVersionSupported(2, 0) ||
+      !areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
                                "GL_EXT_framebuffer_object")) {
-        printf("ERROR: Support for necessary OpenGL extensions missing.");
+    fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing.");
    fflush(stderr);
@ -731,7 +729,8 @@ bool initGL(int *argc, char **argv)
  // projection
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
-    gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, 10.0f);
+  gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f,
+                 10.0f);

  glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);

--- a/Samples/0_Introduction/simpleCUDA2GL/simpleCUDA2GL.cu
+++ b/Samples/0_Introduction/simpleCUDA2GL/simpleCUDA2GL.cu
@ -35,16 +35,14 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
 __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

 // convert floating point rgb color to 8-bit integer
-__device__ int rgbToInt(float r, float g, float b)
-{
+__device__ int rgbToInt(float r, float g, float b) {
  r = clamp(r, 0.0f, 255.0f);
  g = clamp(g, 0.0f, 255.0f);
  b = clamp(b, 0.0f, 255.0f);
  return (int(b) << 16) | (int(g) << 8) | int(r);
 }

-__global__ void cudaProcess(unsigned int *g_odata, int imgw)
-{
+__global__ void cudaProcess(unsigned int *g_odata, int imgw) {
  extern __shared__ uchar4 sdata[];

  int tx = threadIdx.x;
@ -58,7 +56,7 @@ __global__ void cudaProcess(unsigned int *g_odata, int imgw)
  g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
 }

-extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
-{
+extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
+                                   unsigned int *g_odata, int imgw) {
  cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
 }
--- a/Samples/0_Introduction/simpleCallback/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleCallback/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleCallback/README.md
+++ b/Samples/0_Introduction/simpleCallback/README.md
@ -27,6 +27,6 @@ cudaHostAlloc, cudaStreamDestroy, cudaFree, cudaSetDevice, cudaGetDeviceCount, c

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleCallback/multithreading.cpp
+++ b/Samples/0_Introduction/simpleCallback/multithreading.cpp
@ -29,21 +29,18 @@

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
-{
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
  return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
 }

 // Wait for thread to finish
-void cutEndThread(CUTThread thread)
-{
+void cutEndThread(CUTThread thread) {
  WaitForSingleObject(thread, INFINITE);
  CloseHandle(thread);
 }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num)
-{
+void cutWaitForThreads(const CUTThread *threads, int num) {
  WaitForMultipleObjects(num, threads, true, INFINITE);

  for (int i = 0; i < num; i++) {
@ -52,8 +49,7 @@ void cutWaitForThreads(const CUTThread *threads, int num)
 }

 // Create barrier.
-CUTBarrier cutCreateBarrier(int releaseCount)
-{
+CUTBarrier cutCreateBarrier(int releaseCount) {
  CUTBarrier barrier;

  InitializeCriticalSection(&barrier.criticalSection);
@ -65,8 +61,7 @@ CUTBarrier cutCreateBarrier(int releaseCount)
 }

 // Increment barrier. (execution continues)
-void cutIncrementBarrier(CUTBarrier *barrier)
-{
+void cutIncrementBarrier(CUTBarrier *barrier) {
  int myBarrierCount;
  EnterCriticalSection(&barrier->criticalSection);
  myBarrierCount = ++barrier->count;
@ -78,15 +73,16 @@ void cutIncrementBarrier(CUTBarrier *barrier)
 }

 // Wait for barrier release.
-void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }
+void cutWaitForBarrier(CUTBarrier *barrier) {
+  WaitForSingleObject(barrier->barrierEvent, INFINITE);
+}

 // Destroy barrier
 void cutDestroyBarrier(CUTBarrier *barrier) {}

 #else
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
-{
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
  pthread_t thread;
  pthread_create(&thread, NULL, func, data);
  return thread;
@ -96,16 +92,14 @@ CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
 void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num)
-{
+void cutWaitForThreads(const CUTThread *threads, int num) {
  for (int i = 0; i < num; i++) {
    cutEndThread(threads[i]);
  }
 }

 // Create barrier.
-CUTBarrier cutCreateBarrier(int releaseCount)
-{
+CUTBarrier cutCreateBarrier(int releaseCount) {
  CUTBarrier barrier;

  barrier.count = 0;
@ -118,8 +112,7 @@ CUTBarrier cutCreateBarrier(int releaseCount)
 }

 // Increment barrier. (execution continues)
-void cutIncrementBarrier(CUTBarrier *barrier)
-{
+void cutIncrementBarrier(CUTBarrier *barrier) {
  int myBarrierCount;
  pthread_mutex_lock(&barrier->mutex);
  myBarrierCount = ++barrier->count;
@ -131,8 +124,7 @@ void cutIncrementBarrier(CUTBarrier *barrier)
 }

 // Wait for barrier release.
-void cutWaitForBarrier(CUTBarrier *barrier)
-{
+void cutWaitForBarrier(CUTBarrier *barrier) {
  pthread_mutex_lock(&barrier->mutex);

  while (barrier->count < barrier->releaseCount) {
@ -143,8 +135,7 @@ void cutWaitForBarrier(CUTBarrier *barrier)
 }

 // Destroy barrier
-void cutDestroyBarrier(CUTBarrier *barrier)
-{
+void cutDestroyBarrier(CUTBarrier *barrier) {
  pthread_mutex_destroy(&barrier->mutex);
  pthread_cond_destroy(&barrier->conditionVariable);
 }
--- a/Samples/0_Introduction/simpleCallback/multithreading.h
+++ b/Samples/0_Introduction/simpleCallback/multithreading.h
@ -37,8 +37,7 @@
 typedef HANDLE CUTThread;
 typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

-struct CUTBarrier
-{
+struct CUTBarrier {
  CRITICAL_SECTION criticalSection;
  HANDLE barrierEvent;
  int releaseCount;
@ -58,8 +57,7 @@ typedef void *(*CUT_THREADROUTINE)(void *);
 #define CUT_THREADPROC void *
 #define CUT_THREADEND return 0

-struct CUTBarrier
-{
+struct CUTBarrier {
  pthread_mutex_t mutex;
  pthread_cond_t conditionVariable;
  int releaseCount;
@ -69,8 +67,7 @@ struct CUTBarrier
 #endif

 #ifdef __cplusplus
-extern "C"
-{
+extern "C" {
 #endif

 // Create thread.
--- a/Samples/0_Introduction/simpleCallback/simpleCallback.cu
+++ b/Samples/0_Introduction/simpleCallback/simpleCallback.cu
@ -43,8 +43,8 @@
 #include <stdio.h>

 // helper functions and utilities to work with CUDA
-#include <helper_cuda.h>
 #include <helper_functions.h>
+#include <helper_cuda.h>

 #include "multithreading.h"

@ -53,10 +53,10 @@ const int N_elements_per_workload = 100000;

 CUTBarrier thread_barrier;

-void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);
+void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status,
+                                void *data);

-struct heterogeneous_workload
-{
+struct heterogeneous_workload {
  int id;
  int cudaDeviceID;

@ -67,16 +67,13 @@ struct heterogeneous_workload
  bool success;
 };

-__global__ void incKernel(int *data, int N)
-{
+__global__ void incKernel(int *data, int N) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;

-    if (i < N)
-        data[i]++;
+  if (i < N) data[i]++;
 }

-CUT_THREADPROC launch(void *void_arg)
-{
+CUT_THREADPROC launch(void *void_arg) {
  heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

  // Select GPU for this CPU thread
@ -84,8 +81,11 @@ CUT_THREADPROC launch(void *void_arg)

  // Allocate Resources
  checkCudaErrors(cudaStreamCreate(&workload->stream));
-    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
-    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));
+  checkCudaErrors(
+      cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
+  checkCudaErrors(cudaHostAlloc(&workload->h_data,
+                                N_elements_per_workload * sizeof(int),
+                                cudaHostAllocPortable));

  // CPU thread generates data
  for (int i = 0; i < N_elements_per_workload; ++i) {
@ -97,28 +97,25 @@ CUT_THREADPROC launch(void *void_arg)
  dim3 block(512);
  dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

-    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
-                                    workload->h_data,
+  checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data,
                                  N_elements_per_workload * sizeof(int),
-                                    cudaMemcpyHostToDevice,
-                                    workload->stream));
-    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
-    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
-                                    workload->d_data,
+                                  cudaMemcpyHostToDevice, workload->stream));
+  incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data,
+                                                  N_elements_per_workload);
+  checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data,
                                  N_elements_per_workload * sizeof(int),
-                                    cudaMemcpyDeviceToHost,
-                                    workload->stream));
+                                  cudaMemcpyDeviceToHost, workload->stream));

  // New in CUDA 5.0: Add a CPU callback which is called once all currently
  // pending operations in the CUDA stream have finished
-    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
+  checkCudaErrors(
+      cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

  CUT_THREADEND;
  // CPU thread end of life, GPU continues to process data...
 }

-CUT_THREADPROC postprocess(void *void_arg)
-{
+CUT_THREADPROC postprocess(void *void_arg) {
  heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
  // ... GPU is done with processing, continue on new CPU thread...

@ -143,8 +140,8 @@ CUT_THREADPROC postprocess(void *void_arg)
  CUT_THREADEND;
 }

-void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
-{
+void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
+                                void *data) {
  // Check status of GPU after stream operations are done
  checkCudaErrors(status);

@ -152,8 +149,7 @@ void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *d
  cutStartThread(postprocess, data);
 }

-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  int N_gpus, max_gpus = 0;
  int gpuInfo[32];  // assume a maximum of 32 GPUs in a system configuration

@ -172,8 +168,10 @@ int main(int argc, char **argv)
    cudaSetDevice(devid);
    cudaGetDeviceProperties(&deviceProp, devid);
-        SMversion = deviceProp.major << 4 + deviceProp.minor;
-        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
-        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");
+    SMversion = (deviceProp.major << 4) + deviceProp.minor;  // SM 7.5 -> 0x75
+    printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name,
+           deviceProp.major, deviceProp.minor);
+    printf(", %s GPU Callback Functions\n",
+           (SMversion >= 0x11) ? "capable" : "NOT capable");

    if (SMversion >= 0x11) {
      gpuInfo[max_gpus++] = devid;
@ -183,7 +181,8 @@ int main(int argc, char **argv)
  printf("%d GPUs available to run Callback Functions\n", max_gpus);

  heterogeneous_workload *workloads;
-    workloads = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
+  workloads = (heterogeneous_workload *)malloc(N_workloads *
+                                               sizeof(heterogeneous_workload));
  ;
  thread_barrier = cutCreateBarrier(N_workloads);

--- a/Samples/0_Introduction/simpleCooperativeGroups/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleCooperativeGroups/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleCooperativeGroups/README.md
+++ b/Samples/0_Introduction/simpleCooperativeGroups/README.md
@ -27,6 +27,6 @@ cudaDeviceSynchronize, cudaGetErrorString

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu
+++ b/Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu
@ -38,8 +38,8 @@
 *
 */

-#include <cooperative_groups.h>
 #include <stdio.h>
+#include <cooperative_groups.h>

 using namespace cooperative_groups;

@ -49,8 +49,7 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
-__device__ int sumReduction(thread_group g, int *x, int val)
-{
+__device__ int sumReduction(thread_group g, int *x, int val) {
  // rank of this thread in the group
  int lane = g.thread_rank();

@ -86,8 +85,7 @@ __device__ int sumReduction(thread_group g, int *x, int val)
 *
 * Creates cooperative groups and performs reductions
 */
-__global__ void cgkernel()
-{
+__global__ void cgkernel() {
  // threadBlockGroup includes all threads in the block
  thread_block threadBlockGroup = this_thread_block();
  int threadBlockGroupSize = threadBlockGroup.size();
@ -109,22 +107,24 @@ __global__ void cgkernel()

  // master thread in group prints out result
  if (threadBlockGroup.thread_rank() == 0) {
-        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
-               (int)threadBlockGroup.size() - 1,
-               output,
-               expectedOutput);
+    printf(
+        " Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
+        (int)threadBlockGroup.size() - 1, output, expectedOutput);

-        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
+    printf(" Now creating %d groups, each of size 16 threads:\n\n",
+           (int)threadBlockGroup.size() / 16);
  }

  threadBlockGroup.sync();

  // each tiledPartition16 group includes 16 threads
-    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);
+  thread_block_tile<16> tiledPartition16 =
+      tiled_partition<16>(threadBlockGroup);

  // This offset allows each group to have its own unique area in the workspace
  // array
-    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
+  int workspaceOffset =
+      threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();

-    // input to reduction, for each thread, is its' rank in the group
+  // input to reduction, for each thread, is its rank in the group
  input = tiledPartition16.thread_rank();
@ -138,10 +138,10 @@ __global__ void cgkernel()

  // each master thread prints out result
  if (tiledPartition16.thread_rank() == 0)
-        printf("   Sum of all ranks 0..15 in this tiledPartition16 group is %d "
+    printf(
+        "   Sum of all ranks 0..15 in this tiledPartition16 group is %d "
        "(expected %d)\n",
-               output,
-               expectedOutput);
+        output, expectedOutput);

  return;
 }
@ -149,8 +149,7 @@ __global__ void cgkernel()
 /**
 * Host main routine
 */
-int main()
-{
+int main() {
  // Error code to check return values for CUDA calls
  cudaError_t err;

@ -167,7 +166,8 @@ int main()
  err = cudaDeviceSynchronize();

  if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
+    fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
+            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

--- a/Samples/0_Introduction/simpleCubemapTexture/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleCubemapTexture/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
--- a/Samples/0_Introduction/simpleCubemapTexture/README.md
+++ b/Samples/0_Introduction/simpleCubemapTexture/README.md
@ -27,6 +27,6 @@ cudaMemcpy, cudaCreateChannelDesc, cudaFreeArray, cudaFree, cudaPitchedPtr, cuda

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu
+++ b/Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu
@ -36,17 +36,17 @@
 */

 // includes, system
-#include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
+#include <math.h>

 // includes CUDA
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_cuda.h>
 #include <helper_functions.h>
+#include <helper_cuda.h>

 static const char *sSDKname = "simpleCubemapTexture";

@ -56,8 +56,8 @@ static const char *sSDKname = "simpleCubemapTexture";
-//! Transform a cubemap face of a linear buffe using cubemap texture lookups
+//! Transform a cubemap face of a linear buffer using cubemap texture lookups
 //! @param g_odata  output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
-{
+__global__ void transformKernel(float *g_odata, int width,
+                                cudaTextureObject_t tex) {
  // calculate this thread's data point
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -110,15 +110,15 @@ __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t t
    }

    // read from texture, do expected transformation and write to global memory
-        g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
+    g_odata[face * width * width + y * width + x] =
+        -texCubemap<float>(tex, cx, cy, cz);
  }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);
@ -129,11 +129,13 @@ int main(int argc, char **argv)
  cudaDeviceProp deviceProps;

  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
+  printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
+         deviceProps.multiProcessorCount);
  printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

  if (deviceProps.major < 2) {
-        printf("%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
+    printf(
+        "%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
        "will exit... \n",
        sSDKname);

@ -155,7 +157,8 @@ int main(int argc, char **argv)

  for (unsigned int layer = 0; layer < num_layers; layer++) {
    for (int i = 0; i < (int)(cubemap_size); i++) {
-            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
+      h_data_ref[layer * cubemap_size + i] =
+          -h_data[layer * cubemap_size + i] + layer;
    }
  }

@ -164,16 +167,19 @@ int main(int argc, char **argv)
  checkCudaErrors(cudaMalloc((void **)&d_data, size));

  // allocate array and copy image data
-    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+  cudaChannelFormatDesc channelDesc =
+      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cu_3darray;
  //    checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
  //    make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
-    checkCudaErrors(
-        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
+  checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
+                                    make_cudaExtent(width, width, num_faces),
+                                    cudaArrayCubemap));
  cudaMemcpy3DParms myparms = {0};
  myparms.srcPos = make_cudaPos(0, 0, 0);
  myparms.dstPos = make_cudaPos(0, 0, 0);
-    myparms.srcPtr            = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
+  myparms.srcPtr =
+      make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
  myparms.dstArray = cu_3darray;
  myparms.extent = make_cudaExtent(width, width, num_faces);
  myparms.kind = cudaMemcpyHostToDevice;
@ -201,12 +207,10 @@ int main(int argc, char **argv)
  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

-    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
+  printf(
+      "Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
      "block has 8 x 8 threads\n",
-           width,
-           num_layers,
-           dimGrid.x,
-           dimGrid.y);
+      width, num_layers, dimGrid.x, dimGrid.y);

  transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                         tex);  // warmup (for better timing)
@ -229,7 +233,8 @@ int main(int argc, char **argv)
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
+  printf("%.2f Mtexlookups/sec\n",
+         (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
  sdkDeleteTimer(&timer);

  // allocate mem for the result on host side
@ -240,13 +245,14 @@ int main(int argc, char **argv)
  // write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
-        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
-    }
-    else {
+    sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f,
+                        false);
+  } else {
    printf("Comparing kernel output to expected data\n");

 #define MIN_EPSILON_ERROR 5e-3f
-        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
+    bResult =
+        compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
  }

  // cleanup memory
--- a/Samples/0_Introduction/simpleDrvRuntime/CMakeLists.txt
+++ b/Samples/0_Introduction/simpleDrvRuntime/CMakeLists.txt
@ -10,10 +10,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(ENABLE_CUDA_DEBUG)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
-else()
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
 endif()

 # Include directories and libraries
@ -42,12 +40,6 @@ target_link_libraries(simpleDrvRuntime PUBLIC
 set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/vectorAdd_kernel64.fatbin")
 set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/vectorAdd_kernel.cu")

-# Construct GENCODE_FLAGS explicitly from CUDA architectures
-set(GENCODE_FLAGS "")
-foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
-    list(APPEND GENCODE_FLAGS "-gencode=arch=compute_${arch},code=sm_${arch}")
-endforeach()
-
 add_custom_command(
    OUTPUT ${CUDA_FATBIN_FILE}
    COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets  ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}
--- a/Samples/0_Introduction/simpleDrvRuntime/README.md
+++ b/Samples/0_Introduction/simpleDrvRuntime/README.md
@ -30,6 +30,6 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn

 ## Prerequisites

-Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
--- a/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp
+++ b/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp
@ -33,12 +33,12 @@
 */

 // Includes
-#include <cstring>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <iostream>
 #include <stdio.h>
 #include <string.h>
+#include <cstring>
+#include <iostream>

 // includes, project
 #include <helper_cuda.h>
@ -66,10 +66,11 @@ int  CleanupNoFailure(CUcontext &cuContext);
 void RandomInit(float *, int);
 bool findModulePath(const char *, string &, char **, ostringstream &);

-static void check(CUresult result, char const *const func, const char *const file, int const line)
-{
+static void check(CUresult result, char const *const func,
+                  const char *const file, int const line) {
  if (result) {
-        fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
+    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), func);
    exit(EXIT_FAILURE);
  }
 }
@ -77,8 +78,7 @@ static void check(CUresult result, char const *const func, const char *const fil
 #define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

 // Host code
-int main(int argc, char **argv)
-{
+int main(int argc, char **argv) {
  printf("simpleDrvRuntime..\n");
  int N = 50000, devID = 0;
  size_t size = N * sizeof(float);
@ -100,8 +100,7 @@ int main(int argc, char **argv)

  if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
-    }
-    else {
+  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

@ -114,7 +113,8 @@ int main(int argc, char **argv)
  checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  // Get function handle from module
-    checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
+  checkCudaDrvErrors(
+      cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

  // Allocate input vectors h_A and h_B in host memory
  checkCudaErrors(cudaMallocHost(&h_A, size));
@ -133,8 +133,10 @@ int main(int argc, char **argv)
  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  // Copy vectors from host memory to device memory
-    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

  int threadsPerBlock = 256;
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
@ -142,12 +144,14 @@ int main(int argc, char **argv)
  void *args[] = {&d_A, &d_B, &d_C, &N};

  // Launch the CUDA kernel
-    checkCudaDrvErrors(
-        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));
+  checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
+                                    threadsPerBlock, 1, 1, 0, stream, args,
+                                    NULL));

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
-    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  // Verify result
  int i;
@ -167,8 +171,7 @@ int main(int argc, char **argv)
  exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-int CleanupNoFailure(CUcontext &cuContext)
-{
+int CleanupNoFailure(CUcontext &cuContext) {
  // Free device memory
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
@ -192,21 +195,19 @@ int CleanupNoFailure(CUcontext &cuContext)
  return EXIT_SUCCESS;
 }
 // Allocates an array with random float entries.
-void RandomInit(float *data, int n)
-{
+void RandomInit(float *data, int n) {
  for (int i = 0; i < n; ++i) {
    data[i] = rand() / (float)RAND_MAX;
  }
 }

-bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
-{
+bool inline findModulePath(const char *module_file, string &module_path,
+                           char **argv, ostringstream &ostrm) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
-    }
-    else {
+  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }
@ -214,8 +215,7 @@ bool inline findModulePath(const char *module_file, string &module_path, char **
  if (module_path.empty()) {
    printf("> findModulePath could not find file: <%s> \n", module_file);
    return false;
-    }
-    else {
+  } else {
    printf("> findModulePath found file at <%s>\n", module_path.c_str());
    if (module_path.rfind("fatbin") != string::npos) {
      ifstream fileIn(module_path.c_str(), ios::binary);
--- a/Show More
+++ b/Show More