Mirror of https://github.com/NVIDIA/cuda-samples.git
Synced 2025-11-04 15:47:50 +08:00
Commit eddc6fd7e1: Merge branch 'master' into cuda_a_dev
.clang-format (new file, 49 lines)

@@ -0,0 +1,49 @@
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: Consecutive
+AlignConsecutiveDeclarations: Consecutive
+AlignConsecutiveMacros: Consecutive
+AlignEscapedNewlines: Left
+AlignOperands: AlignAfterOperator
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: false
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+    AfterClass: true
+    AfterControlStatement: false
+    AfterExternBlock: true
+    AfterFunction: true
+    AfterStruct: true
+    AfterUnion: true
+    BeforeCatch: true
+    BeforeElse: true
+    IndentBraces: false
+BreakBeforeBraces: Custom
+BreakBeforeConceptDeclarations: true
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+ColumnLimit: 120
+DerivePointerAlignment: false
+FixNamespaceComments: true
+IncludeCategories:
+  - Regex:           '^<.*>'
+    Priority:        1
+  - Regex:           '^".*"'
+    Priority:        2
+SortIncludes: true
+IncludeBlocks: Regroup
+IndentWidth: 4
+MaxEmptyLinesToKeep: 2
+PointerAlignment: Right
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+Standard: c++17
+TabWidth: 4
+UseTab: Never
+...
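To illustrate the style this file enforces — braces on their own line after functions and aggregates, 4-space indents, pointers bound to the right, and consecutive declarations and assignments aligned in columns — here is a small hypothetical C++ snippet (not part of the commit) laid out the way clang-format would emit it under this configuration:

// Hypothetical example, formatted per the .clang-format above.
struct DeviceBuffer
{
    float       *data; // AlignConsecutiveDeclarations pads names into a column
    unsigned int size; // PointerAlignment: Right binds '*' to the name
};

static int scaleAll(int *values, int count, int factor)
{
    int total  = 0; // AlignConsecutiveAssignments pads '=' into a column
    int scaled = 0;

    for (int i = 0; i < count; i++) { // AfterControlStatement: false keeps '{' here
        scaled    = values[i] * factor;
        values[i] = scaled;
        total     = total + scaled;
    }
    return total;
}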
.pre-commit-config.yaml (new file, 100 lines)

@@ -0,0 +1,100 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+ci:
+    autofix_commit_msg: |
+      [pre-commit.ci] auto code formatting
+    autofix_prs: false
+    autoupdate_branch: ''
+    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
+    autoupdate_schedule: quarterly
+    skip: []
+    submodules: false
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: end-of-file-fixer
+        exclude: |
+          (?x)^(
+            .*\.raw$|
+            .*\.bin$|
+            .*\.dat$|
+            .*\.nv12$|
+            data/.*|
+            Common/.*
+          )
+        files: |
+          (?x)^(
+            .*\.txt$|
+            .*\.md$|
+            .*\.cpp$|
+            .*\.cxx$|
+            .*\.hpp$|
+            .*\.h$|
+            .*\.cu$|
+            .*\.cuh$
+          )
+      - id: mixed-line-ending
+        exclude: |
+          (?x)^(
+            .*\.raw$|
+            .*\.bin$|
+            .*\.dat$|
+            .*\.nv12$|
+            data/.*|
+            Common/.*
+          )
+        files: |
+          (?x)^(
+            .*\.txt$|
+            .*\.md$|
+            .*\.cpp$|
+            .*\.cxx$|
+            .*\.hpp$|
+            .*\.h$|
+            .*\.cu$|
+            .*\.cuh$
+          )
+      - id: trailing-whitespace
+        exclude: |
+          (?x)^(
+            .*\.raw$|
+            .*\.bin$|
+            .*\.dat$|
+            .*\.nv12$|
+            data/.*|
+            Common/.*
+          )
+        files: |
+          (?x)^(
+            .*\.txt$|
+            .*\.md$|
+            .*\.cpp$|
+            .*\.cxx$|
+            .*\.hpp$|
+            .*\.h$|
+            .*\.cu$|
+            .*\.cuh$
+          )
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v19.1.6
+    hooks:
+      - id: clang-format
+        types_or: [file]
+        files: |
+          (?x)^(
+            ^.*\.c$|
+            ^.*\.cpp$|
+            ^.*\.cu$|
+            ^.*\.cuh$|
+            ^.*\.cxx$|
+            ^.*\.h$|
+            ^.*\.hpp$|
+            ^.*\.inl$|
+            ^.*\.mm$
+          )
+        exclude: |
+          (?x)^(
+            Common/.*
+          )
+        args: ["-fallback-style=none", "-style=file", "-i"]
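For context (standard pre-commit usage, not shown in the commit): `pre-commit install` registers these hooks in a clone's `.git/hooks`, and `pre-commit run --all-files` applies them to the whole tree — the three whitespace fixers plus clang-format v19.1.6, which with `-style=file -i` rewrites C/C++/CUDA sources in place according to the `.clang-format` file added above, skipping the excluded `Common/` and data paths.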
README.md
@@ -1,6 +1,6 @@
 # CUDA Samples
 
-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
 
 ## Release Notes
UnifiedMemoryStreams.cu
@@ -31,10 +31,10 @@
  */
 
 // system includes
+#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
-#include <algorithm>
 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else
@@ -51,291 +51,287 @@
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
 // functions
-void srand48(long seed) { srand((unsigned int)seed); }
+void   srand48(long seed) { srand((unsigned int)seed); }
 double drand48() { return double(rand()) / RAND_MAX; }
 #endif
 
 const char *sSDKname = "UnifiedMemoryStreams";
 
 // simple task
-template <typename T>
-struct Task {
-  unsigned int size, id;
-  T *data;
-  T *result;
-  T *vector;
+template <typename T> struct Task
+{
+    unsigned int size, id;
+    T           *data;
+    T           *result;
+    T           *vector;
 
-  Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
-  Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
-    // allocate unified memory -- the operation performed in this example will
-    // be a DGEMV
-    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
-    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
-    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
-    checkCudaErrors(cudaDeviceSynchronize());
-  }
-
-  ~Task() {
-    // ensure all memory is deallocated
-    checkCudaErrors(cudaDeviceSynchronize());
-    checkCudaErrors(cudaFree(data));
-    checkCudaErrors(cudaFree(result));
-    checkCudaErrors(cudaFree(vector));
-  }
-
-  void allocate(const unsigned int s, const unsigned int unique_id) {
-    // allocate unified memory outside of constructor
-    id = unique_id;
-    size = s;
-    checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
-    checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
-    checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
-    checkCudaErrors(cudaDeviceSynchronize());
-
-    // populate data with random elements
-    for (unsigned int i = 0; i < size * size; i++) {
-      data[i] = drand48();
-    }
-
-    for (unsigned int i = 0; i < size; i++) {
-      result[i] = 0.;
-      vector[i] = drand48();
-    }
-  }
+    Task()
+        : size(0)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+        , vector(NULL) {};
+    Task(unsigned int s)
+        : size(s)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+    {
+        // allocate unified memory -- the operation performed in this example will
+        // be a DGEMV
+        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
+        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
+        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
+        checkCudaErrors(cudaDeviceSynchronize());
+    }
+
+    ~Task()
+    {
+        // ensure all memory is deallocated
+        checkCudaErrors(cudaDeviceSynchronize());
+        checkCudaErrors(cudaFree(data));
+        checkCudaErrors(cudaFree(result));
+        checkCudaErrors(cudaFree(vector));
+    }
+
+    void allocate(const unsigned int s, const unsigned int unique_id)
+    {
+        // allocate unified memory outside of constructor
+        id   = unique_id;
+        size = s;
+        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
+        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
+        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        // populate data with random elements
+        for (unsigned int i = 0; i < size * size; i++) {
+            data[i] = drand48();
+        }
+
+        for (unsigned int i = 0; i < size; i++) {
+            result[i] = 0.;
+            vector[i] = drand48();
+        }
+    }
 };
 
 #ifdef USE_PTHREADS
-struct threadData_t {
-  int tid;
-  Task<double> *TaskListPtr;
-  cudaStream_t *streams;
-  cublasHandle_t *handles;
-  int taskSize;
+struct threadData_t
+{
+    int             tid;
+    Task<double>   *TaskListPtr;
+    cudaStream_t   *streams;
+    cublasHandle_t *handles;
+    int             taskSize;
 };
 
 typedef struct threadData_t threadData;
 #endif
 
 // simple host dgemv: assume data is in row-major format and square
-template <typename T>
-void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
-  // rows
-  for (int i = 0; i < n; i++) {
-    result[i] *= beta;
-
-    for (int j = 0; j < n; j++) {
-      result[i] += A[i * n + j] * x[j];
-    }
-  }
+template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
+{
+    // rows
+    for (int i = 0; i < n; i++) {
+        result[i] *= beta;
+
+        for (int j = 0; j < n; j++) {
+            result[i] += A[i * n + j] * x[j];
+        }
+    }
 }
 
 // execute a single task on either host or device depending on size
 #ifdef USE_PTHREADS
-void *execute(void *inpArgs) {
-  threadData *dataPtr = (threadData *)inpArgs;
-  cudaStream_t *stream = dataPtr->streams;
-  cublasHandle_t *handle = dataPtr->handles;
-  int tid = dataPtr->tid;
-
-  for (int i = 0; i < dataPtr->taskSize; i++) {
-    Task<double> &t = dataPtr->TaskListPtr[i];
-
-    if (t.size < 100) {
-      // perform on host
-      printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-             t.size);
-
-      // attach managed memory to a (dummy) stream to allow host access while
-      // the device is running
-      checkCudaErrors(
-          cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-      checkCudaErrors(
-          cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-      checkCudaErrors(
-          cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
-      // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
-      checkCudaErrors(cudaStreamSynchronize(stream[0]));
-      // call the host operation
-      gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    } else {
-      // perform on device
-      printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-             t.size);
-      double one = 1.0;
-      double zero = 0.0;
-
-      // attach managed memory to my stream
-      checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                               cudaMemAttachSingle));
-      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                               cudaMemAttachSingle));
-      checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                               cudaMemAttachSingle));
-      // call the device operation
-      checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                  &one, t.data, t.size, t.vector, 1, &zero,
-                                  t.result, 1));
-    }
-  }
-
-  pthread_exit(NULL);
+void *execute(void *inpArgs)
+{
+    threadData     *dataPtr = (threadData *)inpArgs;
+    cudaStream_t   *stream  = dataPtr->streams;
+    cublasHandle_t *handle  = dataPtr->handles;
+    int             tid     = dataPtr->tid;
+
+    for (int i = 0; i < dataPtr->taskSize; i++) {
+        Task<double> &t = dataPtr->TaskListPtr[i];
+
+        if (t.size < 100) {
+            // perform on host
+            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+
+            // attach managed memory to a (dummy) stream to allow host access while
+            // the device is running
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+            // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
+            checkCudaErrors(cudaStreamSynchronize(stream[0]));
+            // call the host operation
+            gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
+        }
+        else {
+            // perform on device
+            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
+            double one  = 1.0;
+            double zero = 0.0;
+
+            // attach managed memory to my stream
+            checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
+            // call the device operation
+            checkCudaErrors(cublasDgemv(
+                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
+        }
+    }
+
+    pthread_exit(NULL);
 }
 #else
-template <typename T>
-void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
-             int tid) {
-  if (t.size < 100) {
-    // perform on host
-    printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-           t.size);
-
-    // attach managed memory to a (dummy) stream to allow host access while the
-    // device is running
-    checkCudaErrors(
-        cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-    checkCudaErrors(
-        cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-    checkCudaErrors(
-        cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
-    // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
-    checkCudaErrors(cudaStreamSynchronize(stream[0]));
-    // call the host operation
-    gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-  } else {
-    // perform on device
-    printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-           t.size);
-    double one = 1.0;
-    double zero = 0.0;
-
-    // attach managed memory to my stream
-    checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                             cudaMemAttachSingle));
-    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                             cudaMemAttachSingle));
-    checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                             cudaMemAttachSingle));
-    // call the device operation
-    checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                &one, t.data, t.size, t.vector, 1, &zero,
-                                t.result, 1));
-  }
+template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
+{
+    if (t.size < 100) {
+        // perform on host
+        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
+
+        // attach managed memory to a (dummy) stream to allow host access while the
+        // device is running
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+        // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
+        checkCudaErrors(cudaStreamSynchronize(stream[0]));
+        // call the host operation
+        gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
+    }
+    else {
+        // perform on device
+        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
+        double one  = 1.0;
+        double zero = 0.0;
+
+        // attach managed memory to my stream
+        checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
+        // call the device operation
+        checkCudaErrors(cublasDgemv(
+            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
+    }
 }
 #endif
 
 // populate a list of tasks with random sizes
-template <typename T>
-void initialise_tasks(std::vector<Task<T> > &TaskList) {
-  for (unsigned int i = 0; i < TaskList.size(); i++) {
-    // generate random size
-    int size;
-    size = std::max((int)(drand48() * 1000.0), 64);
-    TaskList[i].allocate(size, i);
-  }
+template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
+{
+    for (unsigned int i = 0; i < TaskList.size(); i++) {
+        // generate random size
+        int size;
+        size = std::max((int)(drand48() * 1000.0), 64);
+        TaskList[i].allocate(size, i);
+    }
 }
 
-int main(int argc, char **argv) {
-  // set device
-  cudaDeviceProp device_prop;
-  int dev_id = findCudaDevice(argc, (const char **)argv);
-  checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
-
-  if (!device_prop.managedMemory) {
-    // This samples requires being run on a device that supports Unified Memory
-    fprintf(stderr, "Unified Memory not supported on this device\n");
-
-    exit(EXIT_WAIVED);
-  }
-
-  if (device_prop.computeMode == cudaComputeModeProhibited) {
-    // This sample requires being run with a default or process exclusive mode
-    fprintf(stderr,
-            "This sample requires a device in either default or process "
-            "exclusive mode\n");
-
-    exit(EXIT_WAIVED);
-  }
-
-  // randomise task sizes
-  int seed = (int)time(NULL);
-  srand48(seed);
-
-  // set number of threads
-  const int nthreads = 4;
-
-  // number of streams = number of threads
-  cudaStream_t *streams = new cudaStream_t[nthreads + 1];
-  cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
-
-  for (int i = 0; i < nthreads + 1; i++) {
-    checkCudaErrors(cudaStreamCreate(&streams[i]));
-    checkCudaErrors(cublasCreate(&handles[i]));
-  }
-
-  // create list of N tasks
-  unsigned int N = 40;
-  std::vector<Task<double> > TaskList(N);
-  initialise_tasks(TaskList);
-
-  printf("Executing tasks on host / device\n");
+int main(int argc, char **argv)
+{
+    // set device
+    cudaDeviceProp device_prop;
+    int            dev_id = findCudaDevice(argc, (const char **)argv);
+    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
+
+    if (!device_prop.managedMemory) {
+        // This samples requires being run on a device that supports Unified Memory
+        fprintf(stderr, "Unified Memory not supported on this device\n");
+
+        exit(EXIT_WAIVED);
+    }
+
+    if (device_prop.computeMode == cudaComputeModeProhibited) {
+        // This sample requires being run with a default or process exclusive mode
+        fprintf(stderr,
+                "This sample requires a device in either default or process "
+                "exclusive mode\n");
+
+        exit(EXIT_WAIVED);
+    }
+
+    // randomise task sizes
+    int seed = (int)time(NULL);
+    srand48(seed);
+
+    // set number of threads
+    const int nthreads = 4;
+
+    // number of streams = number of threads
+    cudaStream_t   *streams = new cudaStream_t[nthreads + 1];
+    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
+
+    for (int i = 0; i < nthreads + 1; i++) {
+        checkCudaErrors(cudaStreamCreate(&streams[i]));
+        checkCudaErrors(cublasCreate(&handles[i]));
+    }
+
+    // create list of N tasks
+    unsigned int              N = 40;
+    std::vector<Task<double>> TaskList(N);
+    initialise_tasks(TaskList);
+
+    printf("Executing tasks on host / device\n");
 
 // run through all tasks using threads and streams
 #ifdef USE_PTHREADS
-  pthread_t threads[nthreads];
-  threadData *InputToThreads = new threadData[nthreads];
-
-  for (int i = 0; i < nthreads; i++) {
-    checkCudaErrors(cudaSetDevice(dev_id));
-    InputToThreads[i].tid = i;
-    InputToThreads[i].streams = streams;
-    InputToThreads[i].handles = handles;
-
-    if ((TaskList.size() / nthreads) == 0) {
-      InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-      InputToThreads[i].TaskListPtr =
-          &TaskList[i * (TaskList.size() / nthreads)];
-    } else {
-      if (i == nthreads - 1) {
-        InputToThreads[i].taskSize =
-            (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
-        InputToThreads[i].TaskListPtr =
-            &TaskList[i * (TaskList.size() / nthreads) +
-                      (TaskList.size() % nthreads)];
-      } else {
-        InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-        InputToThreads[i].TaskListPtr =
-            &TaskList[i * (TaskList.size() / nthreads)];
-      }
-    }
-
-    pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
-  }
-  for (int i = 0; i < nthreads; i++) {
-    pthread_join(threads[i], NULL);
-  }
+    pthread_t   threads[nthreads];
+    threadData *InputToThreads = new threadData[nthreads];
+
+    for (int i = 0; i < nthreads; i++) {
+        checkCudaErrors(cudaSetDevice(dev_id));
+        InputToThreads[i].tid     = i;
+        InputToThreads[i].streams = streams;
+        InputToThreads[i].handles = handles;
+
+        if ((TaskList.size() / nthreads) == 0) {
+            InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
+            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+        }
+        else {
+            if (i == nthreads - 1) {
+                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+                InputToThreads[i].TaskListPtr =
+                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
+            }
+            else {
+                InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
+                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+            }
+        }
+
+        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
+    }
+    for (int i = 0; i < nthreads; i++) {
+        pthread_join(threads[i], NULL);
+    }
 #else
-  omp_set_num_threads(nthreads);
+    omp_set_num_threads(nthreads);
 #pragma omp parallel for schedule(dynamic)
-  for (int i = 0; i < TaskList.size(); i++) {
-    checkCudaErrors(cudaSetDevice(dev_id));
-    int tid = omp_get_thread_num();
-    execute(TaskList[i], handles, streams, tid);
-  }
+    for (int i = 0; i < TaskList.size(); i++) {
+        checkCudaErrors(cudaSetDevice(dev_id));
+        int tid = omp_get_thread_num();
+        execute(TaskList[i], handles, streams, tid);
+    }
 #endif
 
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 
-  // Destroy CUDA Streams, cuBlas handles
-  for (int i = 0; i < nthreads + 1; i++) {
-    cudaStreamDestroy(streams[i]);
-    cublasDestroy(handles[i]);
-  }
+    // Destroy CUDA Streams, cuBlas handles
+    for (int i = 0; i < nthreads + 1; i++) {
+        cudaStreamDestroy(streams[i]);
+        cublasDestroy(handles[i]);
+    }
 
-  // Free TaskList
-  std::vector<Task<double> >().swap(TaskList);
+    // Free TaskList
+    std::vector<Task<double>>().swap(TaskList);
 
-  printf("All Done!\n");
-  exit(EXIT_SUCCESS);
+    printf("All Done!\n");
+    exit(EXIT_SUCCESS);
 }
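The core idiom the hunk above reformats but does not change is worth isolating. The following is a minimal, hypothetical sketch (not repository code; error checking omitted) of the Unified Memory pattern UnifiedMemoryStreams relies on — allocate managed memory, attach it to the host before CPU access, and attach it to a single stream before device work:

// Minimal sketch of the Unified Memory + stream-association idiom used above.
// Assumes a device with managed memory support; error handling omitted for brevity.
#include <cuda_runtime.h>

int main()
{
    float       *vec = nullptr;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMallocManaged(&vec, 1024 * sizeof(float));

    // Associate the allocation with the host so the CPU may touch it
    // while other streams keep the device busy.
    cudaStreamAttachMemAsync(stream, vec, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream); // attachment must complete before host access

    for (int i = 0; i < 1024; i++)
        vec[i] = 1.0f; // safe host access

    // Re-associate with a single stream before launching device work on it.
    cudaStreamAttachMemAsync(stream, vec, 0, cudaMemAttachSingle);

    cudaStreamDestroy(stream);
    cudaFree(vec);
    return 0;
}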
asyncAPI.cu
@@ -38,105 +38,107 @@
 #include <stdio.h>
 
 // includes CUDA Runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
 
 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h>  // helper utility functions
+#include <helper_functions.h> // helper utility functions
 
-__global__ void increment_kernel(int *g_data, int inc_value) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  g_data[idx] = g_data[idx] + inc_value;
+__global__ void increment_kernel(int *g_data, int inc_value)
+{
+    int idx     = blockIdx.x * blockDim.x + threadIdx.x;
+    g_data[idx] = g_data[idx] + inc_value;
 }
 
-bool correct_output(int *data, const int n, const int x) {
-  for (int i = 0; i < n; i++)
-    if (data[i] != x) {
-      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
-      return false;
-    }
-
-  return true;
-}
-
-int main(int argc, char *argv[]) {
-  int devID;
-  cudaDeviceProp deviceProps;
-
-  printf("[%s] - Starting...\n", argv[0]);
-
-  // This will pick the best possible CUDA capable device
-  devID = findCudaDevice(argc, (const char **)argv);
-
-  // get device name
-  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-  printf("CUDA device [%s]\n", deviceProps.name);
-
-  int n = 16 * 1024 * 1024;
-  int nbytes = n * sizeof(int);
-  int value = 26;
-
-  // allocate host memory
-  int *a = 0;
-  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
-  memset(a, 0, nbytes);
-
-  // allocate device memory
-  int *d_a = 0;
-  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
-  checkCudaErrors(cudaMemset(d_a, 255, nbytes));
-
-  // set kernel launch configuration
-  dim3 threads = dim3(512, 1);
-  dim3 blocks = dim3(n / threads.x, 1);
-
-  // create cuda event handles
-  cudaEvent_t start, stop;
-  checkCudaErrors(cudaEventCreate(&start));
-  checkCudaErrors(cudaEventCreate(&stop));
-
-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
-  sdkResetTimer(&timer);
-
-  checkCudaErrors(cudaDeviceSynchronize());
-  float gpu_time = 0.0f;
-
-  // asynchronously issue work to the GPU (all to stream 0)
-  checkCudaErrors(cudaProfilerStart());
-  sdkStartTimer(&timer);
-  cudaEventRecord(start, 0);
-  cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
-  increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
-  cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
-  cudaEventRecord(stop, 0);
-  sdkStopTimer(&timer);
-  checkCudaErrors(cudaProfilerStop());
-
-  // have CPU do some work while waiting for stage 1 to finish
-  unsigned long int counter = 0;
-
-  while (cudaEventQuery(stop) == cudaErrorNotReady) {
-    counter++;
-  }
-
-  checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
-
-  // print the cpu and gpu times
-  printf("time spent executing by the GPU: %.2f\n", gpu_time);
-  printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-  printf("CPU executed %lu iterations while waiting for GPU to finish\n",
-         counter);
-
-  // check the output for correctness
-  bool bFinalResults = correct_output(a, n, value);
-
-  // release resources
-  checkCudaErrors(cudaEventDestroy(start));
-  checkCudaErrors(cudaEventDestroy(stop));
-  checkCudaErrors(cudaFreeHost(a));
-  checkCudaErrors(cudaFree(d_a));
-
-  exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
+bool correct_output(int *data, const int n, const int x)
+{
+    for (int i = 0; i < n; i++)
+        if (data[i] != x) {
+            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+            return false;
+        }
+
+    return true;
+}
+
+int main(int argc, char *argv[])
+{
+    int            devID;
+    cudaDeviceProp deviceProps;
+
+    printf("[%s] - Starting...\n", argv[0]);
+
+    // This will pick the best possible CUDA capable device
+    devID = findCudaDevice(argc, (const char **)argv);
+
+    // get device name
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+    printf("CUDA device [%s]\n", deviceProps.name);
+
+    int n      = 16 * 1024 * 1024;
+    int nbytes = n * sizeof(int);
+    int value  = 26;
+
+    // allocate host memory
+    int *a = 0;
+    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+    memset(a, 0, nbytes);
+
+    // allocate device memory
+    int *d_a = 0;
+    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+    checkCudaErrors(cudaMemset(d_a, 255, nbytes));
+
+    // set kernel launch configuration
+    dim3 threads = dim3(512, 1);
+    dim3 blocks  = dim3(n / threads.x, 1);
+
+    // create cuda event handles
+    cudaEvent_t start, stop;
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);
+    sdkResetTimer(&timer);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    float gpu_time = 0.0f;
+
+    // asynchronously issue work to the GPU (all to stream 0)
+    checkCudaErrors(cudaProfilerStart());
+    sdkStartTimer(&timer);
+    cudaEventRecord(start, 0);
+    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
+    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
+    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
+    cudaEventRecord(stop, 0);
+    sdkStopTimer(&timer);
+    checkCudaErrors(cudaProfilerStop());
+
+    // have CPU do some work while waiting for stage 1 to finish
+    unsigned long int counter = 0;
+
+    while (cudaEventQuery(stop) == cudaErrorNotReady) {
+        counter++;
+    }
+
+    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
+
+    // print the cpu and gpu times
+    printf("time spent executing by the GPU: %.2f\n", gpu_time);
+    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
+    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
+
+    // check the output for correctness
+    bool bFinalResults = correct_output(a, n, value);
+
+    // release resources
+    checkCudaErrors(cudaEventDestroy(start));
+    checkCudaErrors(cudaEventDestroy(stop));
+    checkCudaErrors(cudaFreeHost(a));
+    checkCudaErrors(cudaFree(d_a));
+
+    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
 }
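The structure asyncAPI times is easier to see outside the reformatting noise. A minimal, hypothetical sketch (not repository code) of the same async-issue-then-poll idiom: record an event after enqueuing work on stream 0, then let the CPU count iterations until cudaEventQuery reports the event complete:

// Minimal sketch of the async-issue + event-poll idiom shown in the diff above.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0); // all work goes to stream 0
    // ... enqueue async copies and kernel launches here ...
    cudaEventRecord(stop, 0);

    unsigned long iterations = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        iterations++; // the CPU stays free to do real work meanwhile

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU time: %.2f ms after %lu CPU polls\n", ms, iterations);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}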
clock.cu
@@ -48,43 +48,46 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
-  // __shared__ float shared[2 * blockDim.x];
-  extern __shared__ float shared[];
-
-  const int tid = threadIdx.x;
-  const int bid = blockIdx.x;
-
-  if (tid == 0) timer[bid] = clock();
-
-  // Copy input.
-  shared[tid] = input[tid];
-  shared[tid + blockDim.x] = input[tid + blockDim.x];
-
-  // Perform reduction to find minimum.
-  for (int d = blockDim.x; d > 0; d /= 2) {
-    __syncthreads();
-
-    if (tid < d) {
-      float f0 = shared[tid];
-      float f1 = shared[tid + d];
-
-      if (f1 < f0) {
-        shared[tid] = f1;
-      }
-    }
-  }
-
-  // Write result.
-  if (tid == 0) output[bid] = shared[0];
-
-  __syncthreads();
-
-  if (tid == 0) timer[bid + gridDim.x] = clock();
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
+    // __shared__ float shared[2 * blockDim.x];
+    extern __shared__ float shared[];
+
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    if (tid == 0)
+        timer[bid] = clock();
+
+    // Copy input.
+    shared[tid]              = input[tid];
+    shared[tid + blockDim.x] = input[tid + blockDim.x];
+
+    // Perform reduction to find minimum.
+    for (int d = blockDim.x; d > 0; d /= 2) {
+        __syncthreads();
+
+        if (tid < d) {
+            float f0 = shared[tid];
+            float f1 = shared[tid + d];
+
+            if (f1 < f0) {
+                shared[tid] = f1;
+            }
+        }
+    }
+
+    // Write result.
+    if (tid == 0)
+        output[bid] = shared[0];
+
+    __syncthreads();
+
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
 
-#define NUM_BLOCKS 64
+#define NUM_BLOCKS  64
 #define NUM_THREADS 256
 
 // It's interesting to change the number of blocks and the number of threads to
@@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output,
 // the memory. With more than 32 the speed scales linearly.
 
 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
-  printf("CUDA Clock sample\n");
-
-  // This will pick the best possible CUDA capable device
-  int dev = findCudaDevice(argc, (const char **)argv);
-
-  float *dinput = NULL;
-  float *doutput = NULL;
-  clock_t *dtimer = NULL;
-
-  clock_t timer[NUM_BLOCKS * 2];
-  float input[NUM_THREADS * 2];
-
-  for (int i = 0; i < NUM_THREADS * 2; i++) {
-    input[i] = (float)i;
-  }
-
-  checkCudaErrors(
-      cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
-  checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-  checkCudaErrors(
-      cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
-
-  checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                             cudaMemcpyHostToDevice));
-
-  timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-      dinput, doutput, dtimer);
-
-  checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                             cudaMemcpyDeviceToHost));
-
-  checkCudaErrors(cudaFree(dinput));
-  checkCudaErrors(cudaFree(doutput));
-  checkCudaErrors(cudaFree(dtimer));
-
-  long double avgElapsedClocks = 0;
-
-  for (int i = 0; i < NUM_BLOCKS; i++) {
-    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
-  }
-
-  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
-  printf("Average clocks/block = %Lf\n", avgElapsedClocks);
-
-  return EXIT_SUCCESS;
+int main(int argc, char **argv)
+{
+    printf("CUDA Clock sample\n");
+
+    // This will pick the best possible CUDA capable device
+    int dev = findCudaDevice(argc, (const char **)argv);
+
+    float   *dinput  = NULL;
+    float   *doutput = NULL;
+    clock_t *dtimer  = NULL;
+
+    clock_t timer[NUM_BLOCKS * 2];
+    float   input[NUM_THREADS * 2];
+
+    for (int i = 0; i < NUM_THREADS * 2; i++) {
+        input[i] = (float)i;
+    }
+
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
+
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
+
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
+
+    checkCudaErrors(cudaFree(dinput));
+    checkCudaErrors(cudaFree(doutput));
+    checkCudaErrors(cudaFree(dtimer));
+
+    long double avgElapsedClocks = 0;
+
+    for (int i = 0; i < NUM_BLOCKS; i++) {
+        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
+    }
+
+    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
+    printf("Average clocks/block = %Lf\n", avgElapsedClocks);
+
+    return EXIT_SUCCESS;
 }
@ -34,12 +34,11 @@
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
// System includes
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
#include <assert.h>
 | 
			
		||||
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
#include <nvrtc_helper.h>
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
 | 
			
		||||
// helper functions and utilities to work with CUDA
 | 
			
		||||
#include <helper_functions.h>
 | 
			
		||||
@ -71,64 +70,68 @@
 | 
			
		||||
 | 
			
		||||
// Start the main CUDA Sample here
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  printf("CUDA Clock sample\n");
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    printf("CUDA Clock sample\n");
 | 
			
		||||
 | 
			
		||||
  typedef long clock_t;
 | 
			
		||||
    typedef long clock_t;
 | 
			
		||||
 | 
			
		||||
  clock_t timer[NUM_BLOCKS * 2];
 | 
			
		||||
    clock_t timer[NUM_BLOCKS * 2];
 | 
			
		||||
 | 
			
		||||
  float input[NUM_THREADS * 2];
 | 
			
		||||
    float input[NUM_THREADS * 2];
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < NUM_THREADS * 2; i++) {
 | 
			
		||||
    input[i] = (float)i;
 | 
			
		||||
  }
 | 
			
		||||
    for (int i = 0; i < NUM_THREADS * 2; i++) {
 | 
			
		||||
        input[i] = (float)i;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  char *cubin, *kernel_file;
 | 
			
		||||
  size_t cubinSize;
 | 
			
		||||
    char  *cubin, *kernel_file;
 | 
			
		||||
    size_t cubinSize;
 | 
			
		||||
 | 
			
		||||
  kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
 | 
			
		||||
  compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
 | 
			
		||||
    kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
 | 
			
		||||
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
 | 
			
		||||
 | 
			
		||||
  CUmodule module = loadCUBIN(cubin, argc, argv);
 | 
			
		||||
  CUfunction kernel_addr;
 | 
			
		||||
    CUmodule   module = loadCUBIN(cubin, argc, argv);
 | 
			
		||||
    CUfunction kernel_addr;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
 | 
			
		||||
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
 | 
			
		||||
 | 
			
		||||
  dim3 cudaBlockSize(NUM_THREADS, 1, 1);
 | 
			
		||||
  dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
 | 
			
		||||
    dim3 cudaBlockSize(NUM_THREADS, 1, 1);
    dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

  CUdeviceptr dinput, doutput, dtimer;
  checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
  checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
  checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
  checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
    CUdeviceptr dinput, doutput, dtimer;
    checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));

  void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
    void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

  checkCudaErrors(cuLaunchKernel(
      kernel_addr, cudaGridSize.x, cudaGridSize.y,
      cudaGridSize.z,                                    /* grid dim */
      cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
      sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
      &arr[0],                            /* arguments */
      0));
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   sizeof(float) * 2 * NUM_THREADS,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

  checkCudaErrors(cuCtxSynchronize());
  checkCudaErrors(
      cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
  checkCudaErrors(cuMemFree(dinput));
  checkCudaErrors(cuMemFree(doutput));
  checkCudaErrors(cuMemFree(dtimer));
    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemFree(dinput));
    checkCudaErrors(cuMemFree(doutput));
    checkCudaErrors(cuMemFree(dtimer));

  long double avgElapsedClocks = 0;
    long double avgElapsedClocks = 0;

  for (int i = 0; i < NUM_BLOCKS; i++) {
    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
  }
    for (int i = 0; i < NUM_BLOCKS; i++) {
        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
    }

  avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
  printf("Average clocks/block = %Lf\n", avgElapsedClocks);
    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

  return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}

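The argument block in this host code follows the driver-API launch convention: the array holds a pointer to each kernel parameter, in declaration order, and cuLaunchKernel reads the values through those pointers. Below is a minimal sketch of that convention in isolation, not the sample itself; the PTX file name is a placeholder and error checking is omitted to keep the shape of the call visible.

// Hedged sketch of the driver-API launch convention, assuming a placeholder
// module file name of "clock_kernel.ptx".
#include <cuda.h>
#include <ctime>

int main()
{
    CUdevice   dev;
    CUcontext  ctx;
    CUmodule   mod;
    CUfunction fn;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);
    cuModuleLoad(&mod, "clock_kernel.ptx");          // placeholder module name
    cuModuleGetFunction(&fn, mod, "timedReduction"); // extern "C" symbol, so the name is unmangled

    CUdeviceptr dinput, doutput, dtimer;
    cuMemAlloc(&dinput, sizeof(float) * 64 * 2); // one block of 64 threads reads 128 floats
    cuMemAlloc(&doutput, sizeof(float) * 1);
    cuMemAlloc(&dtimer, sizeof(clock_t) * 1 * 2);

    // kernelParams is an array of POINTERS to the arguments, in declaration order.
    void *args[] = {&dinput, &doutput, &dtimer};

    // 1x1x1 grid of 64x1x1 threads, 2*64 floats of dynamic shared memory, default stream.
    cuLaunchKernel(fn, 1, 1, 1, 64, 1, 1, sizeof(float) * 2 * 64, 0, args, 0);
    cuCtxSynchronize();

    cuMemFree(dinput);
    cuMemFree(doutput);
    cuMemFree(dtimer);
    cuCtxDestroy(ctx);
    return 0;
}
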
@@ -37,38 +37,41 @@
// time it takes to do that for each block. The timing results are stored
// in device memory.

extern "C" __global__ void timedReduction(const float *input, float *output,
                                          clock_t *timer) {
  // __shared__ float shared[2 * blockDim.x];
  extern __shared__ float shared[];
extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

  const int tid = threadIdx.x;
  const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

  if (tid == 0) timer[bid] = clock();
    if (tid == 0)
        timer[bid] = clock();

  // Copy input.
  shared[tid] = input[tid];
  shared[tid + blockDim.x] = input[tid + blockDim.x];
    // Copy input.
    shared[tid]              = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

  // Perform reduction to find minimum.
  for (int d = blockDim.x; d > 0; d /= 2) {
    __syncthreads();

    if (tid < d) {
      float f0 = shared[tid];
      float f1 = shared[tid + d];

      if (f1 < f0) {
        shared[tid] = f1;
      }
    }
  }

  // Write result.
  if (tid == 0) output[bid] = shared[0];
    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2) {
        __syncthreads();

        if (tid < d) {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0) {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0)
        output[bid] = shared[0];

  __syncthreads();
    __syncthreads();

  if (tid == 0) timer[bid + gridDim.x] = clock();
    if (tid == 0)
        timer[bid + gridDim.x] = clock();
}

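Every block of timedReduction reduces the same 2 * NUM_THREADS input values, so each output element should equal the global minimum of the input array. A hedged host-side check of that invariant follows; the macro values are assumptions mirroring the sample's usual configuration.

// Hedged host-side reference for timedReduction; 64 blocks and 256 threads
// are assumed values, not taken from this diff.
#define NUM_BLOCKS  64
#define NUM_THREADS 256

bool checkOutput(const float *input, const float *output)
{
    // Minimum over the whole input array.
    float expected = input[0];
    for (int i = 1; i < NUM_THREADS * 2; ++i)
        if (input[i] < expected)
            expected = input[i];

    // Every block wrote its own copy of that minimum.
    for (int b = 0; b < NUM_BLOCKS; ++b)
        if (output[b] != expected)
            return false;
    return true;
}
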
@@ -32,128 +32,125 @@

#include <helper_cuda.h>
#include <omp.h>
#include <stdio.h>  // stdio functions are used since C++ streams aren't necessarily thread safe
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe

using namespace std;

// a simple kernel that simply increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  g_a[idx] += b;
__global__ void kernelAddConstant(int *g_a, const int b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_a[idx] += b;
}

// a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b) {
  for (int i = 0; i < n; i++)
    if (data[i] != i + b) return 0;
int correctResult(int *data, const int n, const int b)
{
    for (int i = 0; i < n; i++)
        if (data[i] != i + b)
            return 0;

  return 1;
    return 1;
}

int main(int argc, char *argv[]) {
  int num_gpus = 0;  // number of CUDA GPUs
int main(int argc, char *argv[])
{
    int num_gpus = 0; // number of CUDA GPUs

  printf("%s Starting...\n\n", argv[0]);
    printf("%s Starting...\n\n", argv[0]);

  /////////////////////////////////////////////////////////////////
  // determine the number of CUDA capable GPUs
  //
  cudaGetDeviceCount(&num_gpus);
    /////////////////////////////////////////////////////////////////
    // determine the number of CUDA capable GPUs
    //
    cudaGetDeviceCount(&num_gpus);

  if (num_gpus < 1) {
    printf("no CUDA capable devices were detected\n");
    return 1;
  }
    if (num_gpus < 1) {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }

  /////////////////////////////////////////////////////////////////
  // display CPU and GPU configuration
  //
  printf("number of host CPUs:\t%d\n", omp_get_num_procs());
  printf("number of CUDA devices:\t%d\n", num_gpus);
    /////////////////////////////////////////////////////////////////
    // display CPU and GPU configuration
    //
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);

  for (int i = 0; i < num_gpus; i++) {
    cudaDeviceProp dprop;
    cudaGetDeviceProperties(&dprop, i);
    printf("   %d: %s\n", i, dprop.name);
  }
    for (int i = 0; i < num_gpus; i++) {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("   %d: %s\n", i, dprop.name);
    }

  printf("---------------------------\n");
    printf("---------------------------\n");

  /////////////////////////////////////////////////////////////////
  // initialize data
  //
  unsigned int n = num_gpus * 8192;
  unsigned int nbytes = n * sizeof(int);
  int *a = 0;  // pointer to data on the CPU
  int b = 3;   // value by which the array is incremented
  a = (int *)malloc(nbytes);
    /////////////////////////////////////////////////////////////////
    // initialize data
    //
    unsigned int n      = num_gpus * 8192;
    unsigned int nbytes = n * sizeof(int);
    int         *a      = 0; // pointer to data on the CPU
    int          b      = 3; // value by which the array is incremented
    a                   = (int *)malloc(nbytes);

  if (0 == a) {
    printf("couldn't allocate CPU memory\n");
    return 1;
  }
    if (0 == a) {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }

  for (unsigned int i = 0; i < n; i++) a[i] = i;
    for (unsigned int i = 0; i < n; i++)
        a[i] = i;

  ////////////////////////////////////////////////////////////////
  // run as many CPU threads as there are CUDA devices
  //   each CPU thread controls a different device, processing its
  //   portion of the data.  It's possible to use more CPU threads
  //   than there are CUDA devices, in which case several CPU
  //   threads will be allocating resources and launching kernels
  //   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
  //   Recall that all variables declared inside an "omp parallel" scope are
  //   local to each CPU thread
  //
  omp_set_num_threads(
      num_gpus);  // create as many CPU threads as there are CUDA devices
    ////////////////////////////////////////////////////////////////
    // run as many CPU threads as there are CUDA devices
    //   each CPU thread controls a different device, processing its
    //   portion of the data.  It's possible to use more CPU threads
    //   than there are CUDA devices, in which case several CPU
    //   threads will be allocating resources and launching kernels
    //   on the same device.  For example, try omp_set_num_threads(2*num_gpus);
    //   Recall that all variables declared inside an "omp parallel" scope are
    //   local to each CPU thread
    //
    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
// omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
// are CUDA devices
#pragma omp parallel
  {
    unsigned int cpu_thread_id = omp_get_thread_num();
    unsigned int num_cpu_threads = omp_get_num_threads();
    {
        unsigned int cpu_thread_id   = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

    // set and check the CUDA device for this CPU thread
    int gpu_id = -1;
    checkCudaErrors(cudaSetDevice(
        cpu_thread_id %
        num_gpus));  // "% num_gpus" allows more CPU threads than GPU devices
    checkCudaErrors(cudaGetDevice(&gpu_id));
    printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
           num_cpu_threads, gpu_id);
        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        checkCudaErrors(
            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
        checkCudaErrors(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

    int *d_a =
        0;  // pointer to memory on the device associated with this CPU thread
    int *sub_a =
        a +
        cpu_thread_id * n /
            num_cpu_threads;  // pointer to this CPU thread's portion of data
    unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
    dim3 gpu_threads(128);  // 128 threads per block
    dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
        int         *d_a   = 0; // pointer to memory on the device associated with this CPU thread
        int         *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3         gpu_threads(128); // 128 threads per block
        dim3         gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
    checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
    checkCudaErrors(
        cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
    kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
        checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
        checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

    checkCudaErrors(
        cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_a));
  }
  printf("---------------------------\n");
        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaFree(d_a));
    }
    printf("---------------------------\n");

  if (cudaSuccess != cudaGetLastError())
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    if (cudaSuccess != cudaGetLastError())
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));

  ////////////////////////////////////////////////////////////////
  // check the result
  //
  bool bResult = correctResult(a, n, b);
    ////////////////////////////////////////////////////////////////
    // check the result
    //
    bool bResult = correctResult(a, n, b);

  if (a) free(a);  // free CPU memory
    if (a)
        free(a); // free CPU memory

  exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

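The sample above binds one OpenMP thread to each CUDA device before any runtime calls are made on it. A minimal standalone sketch of just that binding step, under the assumption that it is compiled with nvcc and OpenMP enabled:

// Hedged sketch of the one-CPU-thread-per-GPU pattern, stripped of the data
// movement; compile with: nvcc -Xcompiler -fopenmp
#include <cuda_runtime.h>
#include <omp.h>
#include <stdio.h>

int main()
{
    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1)
        return 1;

    omp_set_num_threads(num_gpus);
#pragma omp parallel
    {
        int tid = omp_get_thread_num();
        cudaSetDevice(tid % num_gpus); // the modulo tolerates more threads than GPUs
        int dev = -1;
        cudaGetDevice(&dev);
        printf("thread %d -> device %d\n", tid, dev);
    }
    return 0;
}

Setting the device once at the top of the parallel region keeps every later runtime call issued by that thread on its chosen GPU.
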
@@ -25,191 +25,188 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "cuda_fp16.h"
#include "helper_cuda.h"

#include <cstdio>
#include <cstdlib>
#include <ctime>

#define NUM_OF_BLOCKS 128
#include "cuda_fp16.h"
#include "helper_cuda.h"

#define NUM_OF_BLOCKS  128
#define NUM_OF_THREADS 128

__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
  if (threadIdx.x < 64)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
  __syncthreads();
  if (threadIdx.x < 32)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
  __syncthreads();
  if (threadIdx.x < 16)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
  __syncthreads();
  if (threadIdx.x < 8)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
  __syncthreads();
  if (threadIdx.x < 4)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
  __syncthreads();
  if (threadIdx.x < 2)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
  __syncthreads();
  if (threadIdx.x < 1)
    v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
  __syncthreads();
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
{
    if (threadIdx.x < 64)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
    __syncthreads();
    if (threadIdx.x < 32)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
    __syncthreads();
    if (threadIdx.x < 16)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
    __syncthreads();
    if (threadIdx.x < 8)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
    __syncthreads();
    if (threadIdx.x < 4)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
    __syncthreads();
    if (threadIdx.x < 2)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
    __syncthreads();
    if (threadIdx.x < 1)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
    __syncthreads();
}

__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
  if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
  __syncthreads();
  if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
  __syncthreads();
  if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
  __syncthreads();
  if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
  __syncthreads();
  if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
  __syncthreads();
  if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
  __syncthreads();
  if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
  __syncthreads();
__forceinline__ __device__ void reduceInShared_native(half2 *const v)
{
    if (threadIdx.x < 64)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
    __syncthreads();
    if (threadIdx.x < 32)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
    __syncthreads();
    if (threadIdx.x < 16)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
    __syncthreads();
    if (threadIdx.x < 8)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
    __syncthreads();
    if (threadIdx.x < 4)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
    __syncthreads();
    if (threadIdx.x < 2)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
    __syncthreads();
    if (threadIdx.x < 1)
        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
    __syncthreads();
}

__global__ void scalarProductKernel_intrinsics(half2 const *const a,
                                               half2 const *const b,
                                               float *const results,
                                               size_t const size) {
  const int stride = gridDim.x * blockDim.x;
  __shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
    const int        stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];

  shArray[threadIdx.x] = __float2half2_rn(0.f);
  half2 value = __float2half2_rn(0.f);
    shArray[threadIdx.x] = __float2half2_rn(0.f);
    half2 value          = __float2half2_rn(0.f);

  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
    value = __hfma2(a[i], b[i], value);
  }
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
        value = __hfma2(a[i], b[i], value);
    }

  shArray[threadIdx.x] = value;
  __syncthreads();
  reduceInShared_intrinsics(shArray);
    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_intrinsics(shArray);

  if (threadIdx.x == 0) {
    half2 result = shArray[0];
    float f_result = __low2float(result) + __high2float(result);
    results[blockIdx.x] = f_result;
  }
    if (threadIdx.x == 0) {
        half2 result        = shArray[0];
        float f_result      = __low2float(result) + __high2float(result);
        results[blockIdx.x] = f_result;
    }
}

__global__ void scalarProductKernel_native(half2 const *const a,
                                           half2 const *const b,
                                           float *const results,
                                           size_t const size) {
  const int stride = gridDim.x * blockDim.x;
  __shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
    const int        stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];

  half2 value(0.f, 0.f);
  shArray[threadIdx.x] = value;
    half2 value(0.f, 0.f);
    shArray[threadIdx.x] = value;

  for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
    value = a[i] * b[i] + value;
  }
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride) {
        value = a[i] * b[i] + value;
    }

  shArray[threadIdx.x] = value;
  __syncthreads();
  reduceInShared_native(shArray);
    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_native(shArray);

  if (threadIdx.x == 0) {
    half2 result = shArray[0];
    float f_result = (float)result.y + (float)result.x;
    results[blockIdx.x] = f_result;
  }
    if (threadIdx.x == 0) {
        half2 result        = shArray[0];
        float f_result      = (float)result.y + (float)result.x;
        results[blockIdx.x] = f_result;
    }
}

void generateInput(half2 *a, size_t size) {
  for (size_t i = 0; i < size; ++i) {
    half2 temp;
    temp.x = static_cast<float>(rand() % 4);
    temp.y = static_cast<float>(rand() % 2);
    a[i] = temp;
  }
void generateInput(half2 *a, size_t size)
{
    for (size_t i = 0; i < size; ++i) {
        half2 temp;
        temp.x = static_cast<float>(rand() % 4);
        temp.y = static_cast<float>(rand() % 2);
        a[i]   = temp;
    }
}

int main(int argc, char *argv[]) {
  srand((unsigned int)time(NULL));
  size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
int main(int argc, char *argv[])
{
    srand((unsigned int)time(NULL));
    size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

  half2 *vec[2];
  half2 *devVec[2];
    half2 *vec[2];
    half2 *devVec[2];

  float *results;
  float *devResults;
    float *results;
    float *devResults;

  int devID = findCudaDevice(argc, (const char **)argv);
    int devID = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp devProp;
  checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
    cudaDeviceProp devProp;
    checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

  if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
    printf(
        "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
        "higher.\n");
    return EXIT_WAIVED;
  }
    if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
               "higher.\n");
        return EXIT_WAIVED;
    }

  for (int i = 0; i < 2; ++i) {
    checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
    checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
  }
    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
        checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
    }

  checkCudaErrors(
      cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
  checkCudaErrors(
      cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

  for (int i = 0; i < 2; ++i) {
    generateInput(vec[i], size);
    checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
                               cudaMemcpyHostToDevice));
  }
    for (int i = 0; i < 2; ++i) {
        generateInput(vec[i], size);
        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
    }

  scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
      devVec[0], devVec[1], devResults, size);
    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

  checkCudaErrors(cudaMemcpy(results, devResults,
                             NUM_OF_BLOCKS * sizeof *results,
                             cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

  float result_native = 0;
  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
    result_native += results[i];
  }
  printf("Result native operators\t: %f \n", result_native);
    float result_native = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_native += results[i];
    }
    printf("Result native operators\t: %f \n", result_native);

  scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
      devVec[0], devVec[1], devResults, size);
    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

  checkCudaErrors(cudaMemcpy(results, devResults,
                             NUM_OF_BLOCKS * sizeof *results,
                             cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

  float result_intrinsics = 0;
  for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
    result_intrinsics += results[i];
  }
  printf("Result intrinsics\t: %f \n", result_intrinsics);
    float result_intrinsics = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_intrinsics += results[i];
    }
    printf("Result intrinsics\t: %f \n", result_intrinsics);

  printf("&&&& fp16ScalarProduct %s\n",
         (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
                                                             : "FAILED");
    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");

  for (int i = 0; i < 2; ++i) {
    checkCudaErrors(cudaFree(devVec[i]));
    checkCudaErrors(cudaFreeHost(vec[i]));
  }
    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaFree(devVec[i]));
        checkCudaErrors(cudaFreeHost(vec[i]));
    }

  checkCudaErrors(cudaFree(devResults));
  checkCudaErrors(cudaFreeHost(results));
    checkCudaErrors(cudaFree(devResults));
    checkCudaErrors(cudaFreeHost(results));

  return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}

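Each half2 packs two fp16 values, so the kernels above compute a dot product over 2 * size scalars in a grid-stride loop starting at threadIdx.x + blockDim.x * blockIdx.x, with the per-block partial sums added on the host. A hedged fp32 reference for that sum follows; it assumes the public x/y members of half2 and the host-callable __half2float conversion from cuda_fp16.h.

// Hedged fp32 reference for the half2 dot product; not part of the sample.
#include "cuda_fp16.h"
#include <cstddef>

float scalarProductHost(const half2 *a, const half2 *b, size_t size)
{
    float sum = 0.f;
    for (size_t i = 0; i < size; ++i) {
        // Each half2 contributes its low and high fp16 lanes.
        sum += __half2float(a[i].x) * __half2float(b[i].x);
        sum += __half2float(a[i].y) * __half2float(b[i].y);
    }
    return sum;
}
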
@@ -40,314 +40,303 @@
 */

// System includes
#include <stdio.h>
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
    float *B, int wA,
    int wB) {
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;
    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd   = aBegin + wA - 1;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep  = BLOCK_SIZE;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep  = BLOCK_SIZE * wB;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin;
       a <= aEnd;
       a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();
        // Synchronize to make sure the matrices are loaded
        __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c               = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
void ConstantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv,
                   int block_size, const dim3 &dimsA,
                   const dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
  cudaStream_t stream;
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A     = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float       *h_A;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
    unsigned int size_B     = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float       *h_B;
    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
    cudaStream_t stream;

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);
    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;
    // Allocate device memory
    float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C;
  checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
    // Allocate host matrix C
    dim3         dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float       *h_C;
    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
  // Allocate CUDA events that we'll use for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // copy host memory to device
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

  // Setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

  // Create and start timer
  printf("Computing result using CUDA Kernel...\n");
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    MatrixMulCUDA<16>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  } else {
    MatrixMulCUDA<32>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  }

  printf("done\n");
  checkCudaErrors(cudaStreamSynchronize(stream));

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, stream));

  // Execute the kernel
  int nIter = 300;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      MatrixMulCUDA<16>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
      MatrixMulCUDA<32>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
  }

  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, stream));

  // Wait for the stop event to complete
  checkCudaErrors(cudaEventSynchronize(stop));

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  printf(
      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

  // Copy result from device to host
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  printf("Checking computed result for correctness: ");
  bool correct = true;

  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  double eps = 1.e-6;  // machine zero

  for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    double dot_length = dimsA.x;
    double abs_val = fabs(h_C[i]);
    double rel_err = abs_err / abs_val / dot_length;

    if (rel_err > eps) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
             i, h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  checkCudaErrors(cudaFreeHost(h_A));
  checkCudaErrors(cudaFreeHost(h_B));
  checkCudaErrors(cudaFreeHost(h_C));
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance "
      "measurements. Results may vary when GPU Boost is enabled.\n");

  if (correct) {
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float  msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
           " WorkgroupSize= %u threads/block\n",
           gigaFlops,
           msecPerMatrixMul,
           flopsPerMatrixMul,
           threads.x * threads.y);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err    = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val    = fabs(h_C[i]);
        double rel_err    = abs_err / abs_val / dot_length;

        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    printf("\nNOTE: The CUDA Samples are not meant for performance "
           "measurements. Results may vary when GPU Boost is enabled.\n");

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}


/**
 * Program main
 */
int main(int argc, char **argv) {
  printf("[Matrix Multiply Using CUDA] - Starting...\n");
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
    printf("  Note: Outer matrix dimensions of A & B matrices" \
           " must be equal.\n");
    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices"
               " must be equal.\n");

    exit(EXIT_SUCCESS);
  }
        exit(EXIT_SUCCESS);
    }

  // This will pick the best possible CUDA capable device, otherwise
  // override the device ID based on input provided at the command line
  int dev = findCudaDevice(argc, (const char **)argv);
    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);

  int block_size = 32;
    int block_size = 32;

  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

  // width of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
  }
    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

  // height of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
  }
    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

  // width of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
  }
    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

  // height of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
  }
    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

  if (dimsA.x != dimsB.y) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }
    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
         dimsB.x, dimsB.y);
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

  checkCudaErrors(cudaProfilerStart());
  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
  checkCudaErrors(cudaProfilerStop());
    checkCudaErrors(cudaProfilerStart());
    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
    checkCudaErrors(cudaProfilerStop());

  exit(matrix_result);
    exit(matrix_result);
}

@@ -30,11 +30,11 @@

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size)  // Matrix A width
#define HA (6 * block_size)  // Matrix A height
#define WB (4 * block_size)  // Matrix B width
#define HB WA                // Matrix B height
#define WC WB                // Matrix C width
#define HC HA                // Matrix C height
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif  // _MATRIXMUL_H_
#endif // _MATRIXMUL_H_

@@ -46,23 +46,23 @@

// includes, system
#include <builtin_types.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project, CUDA
#include <cstring>
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>

#include <cstring>
#include <iostream>
#include <string>

#include "matrixMul.h"


@@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);

extern "C" void computeGold(float *, const float *, const float *, unsigned int,
                            unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
                    int *blk_size);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);

#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice;
CUdevice  cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;
CUmodule  cuModule;
size_t    totalGlobalMem;

const char *sSDKsample = "matrixMulDrv (Driver API)";

void constantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("[ %s ]\n", sSDKsample);
int main(int argc, char **argv)
{
    printf("[ %s ]\n", sSDKsample);

  runTest(argc, argv);
    runTest(argc, argv);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  // initialize CUDA
  CUfunction matrixMul = NULL;
  int block_size = 0;
void runTest(int argc, char **argv)
{
    // initialize CUDA
    CUfunction matrixMul  = NULL;
    int        block_size = 0;

  initCUDA(argc, argv, &matrixMul, &block_size);
    initCUDA(argc, argv, &matrixMul, &block_size);

  // set seed for rand()
  srand(2006);
    // set seed for rand()
    srand(2006);

  // allocate host memory for matrices A and B
  unsigned int size_A = WA * HA;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
  unsigned int size_B = WB * HB;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
    // allocate host memory for matrices A and B
    unsigned int size_A     = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float       *h_A        = reinterpret_cast<float *>(malloc(mem_size_A));
    unsigned int size_B     = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float       *h_B        = reinterpret_cast<float *>(malloc(mem_size_B));

  // initialize host memory
  const float valB = 0.01f;
  constantInit(h_A, size_A, 1.0f);
  constantInit(h_B, size_B, valB);
    // initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

  // allocate device memory
  CUdeviceptr d_A;
  checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
  CUdeviceptr d_B;
  checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
    // allocate device memory
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));

  // copy host memory to device
  checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
  checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
    // copy host memory to device
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

  // allocate device memory for result
  size_t size_C = WC * HC;
  size_t mem_size_C = sizeof(float) * size_C;
    // allocate device memory for result
    size_t size_C     = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;

  CUdeviceptr d_C;
  checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

  // allocate mem for the result on host side
  float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
    // allocate mem for the result on host side
    float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));

  // create and start timer
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

  // start the timer
  sdkStartTimer(&timer);
    // start the timer
    sdkStartTimer(&timer);

  // There are two ways to launch CUDA kernels via the Driver API.
  // In this CUDA Sample, we illustrate both ways to pass parameters
  // and specify parameters.  By default we use the simpler method.
  dim3 block(block_size, block_size, 1);
  dim3 grid(WC / block_size, HC / block_size, 1);
    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters.  By default we use the simpler method.
    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

  if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
    size_t Matrix_Width_A = (size_t)WA;
    size_t Matrix_Width_B = (size_t)WB;
    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
    // new CUDA 4.0 Driver API Kernel launch call
    checkCudaErrors(cuLaunchKernel(
        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
        2 * block_size * block_size * sizeof(float), NULL, args, NULL));
  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
    char argBuffer[256];

    // pass in launch parameters (not actually de-referencing CUdeviceptr).
    // CUdeviceptr is storing the value of the parameters
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
    offset += sizeof(d_C);
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
    offset += sizeof(d_A);
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
    offset += sizeof(d_B);

    size_t Matrix_Width_A = (size_t)WA;
    size_t Matrix_Width_B = (size_t)WB;

    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
    offset += sizeof(Matrix_Width_A);
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
    offset += sizeof(Matrix_Width_B);

    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                     CU_LAUNCH_PARAM_END};

    // new CUDA 4.0 Driver API Kernel launch call
    checkCudaErrors(cuLaunchKernel(
        matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
        2 * block_size * block_size * sizeof(float), NULL, NULL,
        reinterpret_cast<void **>(&kernel_launch_config)));
  }

  // copy result from device to host
  checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

  // stop and destroy timer
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  sdkDeleteTimer(&timer);

  printf("Checking computed result for correctness: ");
  bool correct = true;

  for (int i = 0; i < static_cast<int>(WC * HC); i++) {
    if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
             h_C[i], WA * valB);
      correct = false;
    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;
        void  *args[5]        = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       args,
                                       NULL));
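
        // Note (editor): in this kernelParams form, each entry of args[] is a
        // host pointer to one kernel argument; the driver reads each value
        // using the size and alignment it knows from the kernel's signature,
        // so no manual packing is required.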
    }
  }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
        offset += sizeof(d_C);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
        offset += sizeof(d_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
        offset += sizeof(d_B);

  printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
         "Results may vary when GPU Boost is enabled.\n");
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;

  // clean up memory
  free(h_A);
  free(h_B);
  free(h_C);
  checkCudaErrors(cuMemFree(d_A));
  checkCudaErrors(cuMemFree(d_B));
  checkCudaErrors(cuMemFree(d_C));
  checkCudaErrors(cuCtxDestroy(cuContext));
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
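
        // Note (editor): this "extra" options form hands the driver one packed
        // byte buffer instead of per-argument pointers: CU_LAUNCH_PARAM_BUFFER_POINTER
        // names the buffer, CU_LAUNCH_PARAM_BUFFER_SIZE gives its length in bytes,
        // and CU_LAUNCH_PARAM_END terminates the list, leaving layout and
        // alignment of every argument to the caller.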

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       NULL,
                                       reinterpret_cast<void **>(&kernel_launch_config)));
    }

    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

    // stop and destroy timer
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < static_cast<int>(WC * HC); i++) {
        if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
            correct = false;
        }
    }
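
    // Note (editor): with h_A filled with 1.0f and h_B with valB, every element
    // of C is a dot product of WA ones with WA copies of valB, so the expected
    // value is exactly WA * valB for the whole matrix.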

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
           "Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    checkCudaErrors(cuCtxDestroy(cuContext));
}

// Allocates a matrix with random float entries.
void randomInit(float *data, int size) {
  for (int i = 0; i < size; ++i) {
    data[i] = rand() / static_cast<float>(RAND_MAX);
  }
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
                    int *blk_size) {
  CUfunction cuFunction = 0;
  int major = 0, minor = 0;
  char deviceName[100];

  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
  printf("  Total amount of global memory:     %llu bytes\n",
         (long long unsigned int)totalGlobalMem);

  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // first search for the module path before we load the results
  std::string module_path;
  std::ostringstream fatbin;

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }

  // Create module from binary file (FATBIN)
  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  // select the suitable kernel function
  const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
                           "matrixMul_bs8_64bit"};

  int idx = 0;
  int block_size = 32;
  while (idx < 3) {
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;

    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
        &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
        2 * block_size * block_size * sizeof(float), 0));
    if (block_size * block_size <= threadsPerBlock) {
      printf("> %d block size selected\n", block_size);
      break;
    } else {
      block_size /= 2;
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / static_cast<float>(RAND_MAX);
    }
    idx++;
  }

  *pMatrixMul = cuFunction;
  *blk_size = block_size;

  return 0;
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0;
    char       deviceName[100];

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    std::string        module_path;
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // select the suitable kernel function
    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

    int idx        = 0;
    int block_size = 32;
    while (idx < 3) {
        int threadsPerBlock = 0;
        int blocksPerGrid   = 0;

        checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
        checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
        if (block_size * block_size <= threadsPerBlock) {
            printf("> %d block size selected\n", block_size);
            break;
        }
        else {
            block_size /= 2;
        }
        idx++;
    }
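
    // Note (editor): the loop above walks the candidate kernels from the largest
    // tile (32x32 = 1024 threads) down to 8x8 and keeps the first block size whose
    // block_size * block_size thread count fits within the occupancy-suggested
    // threadsPerBlock for this device.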

    *pMatrixMul = cuFunction;
    *blk_size   = block_size;

    return 0;
}

@@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
                          size_type wB) {
  // Block index
  size_type bx = blockIdx.x;
  size_type by = blockIdx.y;
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
    // Block index
    size_type bx = blockIdx.x;
    size_type by = blockIdx.y;

  // Thread index
  size_type tx = threadIdx.x;
  size_type ty = threadIdx.y;
    // Thread index
    size_type tx = threadIdx.x;
    size_type ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  size_type aBegin = wA * block_size * by;
    // Index of the first sub-matrix of A processed by the block
    size_type aBegin = wA * block_size * by;

  // Index of the last sub-matrix of A processed by the block
  size_type aEnd = aBegin + wA - 1;
    // Index of the last sub-matrix of A processed by the block
    size_type aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  size_type aStep = block_size;
    // Step size used to iterate through the sub-matrices of A
    size_type aStep = block_size;

  // Index of the first sub-matrix of B processed by the block
  size_type bBegin = block_size * bx;
    // Index of the first sub-matrix of B processed by the block
    size_type bBegin = block_size * bx;

  // Step size used to iterate through the sub-matrices of B
  size_type bStep = block_size * wB;
    // Step size used to iterate through the sub-matrices of B
    size_type bStep = block_size * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[block_size][block_size];
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[block_size][block_size];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[block_size][block_size];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[block_size][block_size];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    AS(ty, tx) = A[a + wA * ty + tx];
    BS(ty, tx) = B[b + wB * ty + tx];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();
        // Synchronize to make sure the matrices are loaded
        __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll

    for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
        for (size_type k = 0; k < block_size; ++k)
            Csub += AS(ty, k) * BS(k, tx);

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  size_type c = wB * block_size * by + block_size * bx;
  C[c + wB * ty + tx] = Csub;
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    size_type c         = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}
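
// Note (editor): each thread block walks one row of tiles of A and one column
// of tiles of B, staging block_size x block_size sub-matrices in shared memory
// so that every element fetched from global memory is reused block_size times.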

// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
                                               size_t wA, size_t wB) {
  matrixMul<8, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB) {
  matrixMul<16, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB) {
  matrixMul<32, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<32, size_t>(C, A, B, wA, wB);
}

#endif  // #ifndef _MATRIXMUL_KERNEL_H_
#endif // #ifndef _MATRIXMUL_KERNEL_H_

@@ -15,210 +15,211 @@

// With these flags defined, this source file will dynamically
// load the corresponding functions.  Disabled by default.
//#define CUDA_INIT_D3D9
//#define CUDA_INIT_D3D10
//#define CUDA_INIT_D3D11
//#define CUDA_INIT_OPENGL
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL

#include <stdio.h>
#include "cuda_drvapi_dynlink.h"

tcuInit                               *_cuInit;
tcuDriverGetVersion                   *cuDriverGetVersion;
tcuDeviceGet                          *cuDeviceGet;
tcuDeviceGetCount                     *cuDeviceGetCount;
tcuDeviceGetName                      *cuDeviceGetName;
tcuDeviceComputeCapability            *cuDeviceComputeCapability;
tcuDeviceTotalMem                     *cuDeviceTotalMem;
tcuDeviceGetProperties                *cuDeviceGetProperties;
tcuDeviceGetAttribute                 *cuDeviceGetAttribute;
tcuGetErrorString                     *cuGetErrorString;
tcuCtxCreate                          *cuCtxCreate;
tcuCtxDestroy                         *cuCtxDestroy;
tcuCtxAttach                          *cuCtxAttach;
tcuCtxDetach                          *cuCtxDetach;
tcuCtxPushCurrent                     *cuCtxPushCurrent;
tcuCtxPopCurrent                      *cuCtxPopCurrent;
tcuCtxGetCurrent                      *cuCtxGetCurrent;
tcuCtxSetCurrent                      *cuCtxSetCurrent;
tcuCtxGetDevice                       *cuCtxGetDevice;
tcuCtxSynchronize                     *cuCtxSynchronize;
tcuModuleLoad                         *cuModuleLoad;
tcuModuleLoadData                     *cuModuleLoadData;
tcuModuleLoadDataEx                   *cuModuleLoadDataEx;
tcuModuleLoadFatBinary                *cuModuleLoadFatBinary;
tcuModuleUnload                       *cuModuleUnload;
tcuModuleGetFunction                  *cuModuleGetFunction;
tcuModuleGetGlobal                    *cuModuleGetGlobal;
tcuModuleGetTexRef                    *cuModuleGetTexRef;
tcuModuleGetSurfRef                   *cuModuleGetSurfRef;
tcuMemGetInfo                         *cuMemGetInfo;
tcuMemAlloc                           *cuMemAlloc;
tcuMemAllocPitch                      *cuMemAllocPitch;
tcuMemFree                            *cuMemFree;
tcuMemGetAddressRange                 *cuMemGetAddressRange;
tcuMemAllocHost                       *cuMemAllocHost;
tcuMemFreeHost                        *cuMemFreeHost;
tcuMemHostAlloc                       *cuMemHostAlloc;
tcuMemHostGetFlags                    *cuMemHostGetFlags;
#include <stdio.h>

tcuMemHostGetDevicePointer            *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId                *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId                  *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle                  *cuIpcGetEventHandle;
tcuIpcOpenEventHandle                 *cuIpcOpenEventHandle;
tcuIpcGetMemHandle                    *cuIpcGetMemHandle;
tcuIpcOpenMemHandle                   *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle                  *cuIpcCloseMemHandle;
tcuInit                    *_cuInit;
tcuDriverGetVersion        *cuDriverGetVersion;
tcuDeviceGet               *cuDeviceGet;
tcuDeviceGetCount          *cuDeviceGetCount;
tcuDeviceGetName           *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem          *cuDeviceTotalMem;
tcuDeviceGetProperties     *cuDeviceGetProperties;
tcuDeviceGetAttribute      *cuDeviceGetAttribute;
tcuGetErrorString          *cuGetErrorString;
tcuCtxCreate               *cuCtxCreate;
tcuCtxDestroy              *cuCtxDestroy;
tcuCtxAttach               *cuCtxAttach;
tcuCtxDetach               *cuCtxDetach;
tcuCtxPushCurrent          *cuCtxPushCurrent;
tcuCtxPopCurrent           *cuCtxPopCurrent;
tcuCtxGetCurrent           *cuCtxGetCurrent;
tcuCtxSetCurrent           *cuCtxSetCurrent;
tcuCtxGetDevice            *cuCtxGetDevice;
tcuCtxSynchronize          *cuCtxSynchronize;
tcuModuleLoad              *cuModuleLoad;
tcuModuleLoadData          *cuModuleLoadData;
tcuModuleLoadDataEx        *cuModuleLoadDataEx;
tcuModuleLoadFatBinary     *cuModuleLoadFatBinary;
tcuModuleUnload            *cuModuleUnload;
tcuModuleGetFunction       *cuModuleGetFunction;
tcuModuleGetGlobal         *cuModuleGetGlobal;
tcuModuleGetTexRef         *cuModuleGetTexRef;
tcuModuleGetSurfRef        *cuModuleGetSurfRef;
tcuMemGetInfo              *cuMemGetInfo;
tcuMemAlloc                *cuMemAlloc;
tcuMemAllocPitch           *cuMemAllocPitch;
tcuMemFree                 *cuMemFree;
tcuMemGetAddressRange      *cuMemGetAddressRange;
tcuMemAllocHost            *cuMemAllocHost;
tcuMemFreeHost             *cuMemFreeHost;
tcuMemHostAlloc            *cuMemHostAlloc;
tcuMemHostGetFlags         *cuMemHostGetFlags;

tcuMemHostRegister                    *cuMemHostRegister;
tcuMemHostUnregister                  *cuMemHostUnregister;
tcuMemcpyHtoD                         *cuMemcpyHtoD;
tcuMemcpyDtoH                         *cuMemcpyDtoH;
tcuMemcpyDtoD                         *cuMemcpyDtoD;
tcuMemcpyDtoA                         *cuMemcpyDtoA;
tcuMemcpyAtoD                         *cuMemcpyAtoD;
tcuMemcpyHtoA                         *cuMemcpyHtoA;
tcuMemcpyAtoH                         *cuMemcpyAtoH;
tcuMemcpyAtoA                         *cuMemcpyAtoA;
tcuMemcpy2D                           *cuMemcpy2D;
tcuMemcpy2DUnaligned                  *cuMemcpy2DUnaligned;
tcuMemcpy3D                           *cuMemcpy3D;
tcuMemcpyHtoDAsync                    *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync                    *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync                    *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync                    *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync                    *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync                      *cuMemcpy2DAsync;
tcuMemcpy3DAsync                      *cuMemcpy3DAsync;
tcuMemcpy                             *cuMemcpy;
tcuMemcpyPeer                         *cuMemcpyPeer;
tcuMemsetD8                           *cuMemsetD8;
tcuMemsetD16                          *cuMemsetD16;
tcuMemsetD32                          *cuMemsetD32;
tcuMemsetD2D8                         *cuMemsetD2D8;
tcuMemsetD2D16                        *cuMemsetD2D16;
tcuMemsetD2D32                        *cuMemsetD2D32;
tcuFuncSetBlockShape                  *cuFuncSetBlockShape;
tcuFuncSetSharedSize                  *cuFuncSetSharedSize;
tcuFuncGetAttribute                   *cuFuncGetAttribute;
tcuFuncSetCacheConfig                 *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig             *cuFuncSetSharedMemConfig;
tcuLaunchKernel                       *cuLaunchKernel;
tcuArrayCreate                        *cuArrayCreate;
tcuArrayGetDescriptor                 *cuArrayGetDescriptor;
tcuArrayDestroy                       *cuArrayDestroy;
tcuArray3DCreate                      *cuArray3DCreate;
tcuArray3DGetDescriptor               *cuArray3DGetDescriptor;
tcuTexRefCreate                       *cuTexRefCreate;
tcuTexRefDestroy                      *cuTexRefDestroy;
tcuTexRefSetArray                     *cuTexRefSetArray;
tcuTexRefSetAddress                   *cuTexRefSetAddress;
tcuTexRefSetAddress2D                 *cuTexRefSetAddress2D;
tcuTexRefSetFormat                    *cuTexRefSetFormat;
tcuTexRefSetAddressMode               *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode                *cuTexRefSetFilterMode;
tcuTexRefSetFlags                     *cuTexRefSetFlags;
tcuTexRefGetAddress                   *cuTexRefGetAddress;
tcuTexRefGetArray                     *cuTexRefGetArray;
tcuTexRefGetAddressMode               *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode                *cuTexRefGetFilterMode;
tcuTexRefGetFormat                    *cuTexRefGetFormat;
tcuTexRefGetFlags                     *cuTexRefGetFlags;
tcuSurfRefSetArray                    *cuSurfRefSetArray;
tcuSurfRefGetArray                    *cuSurfRefGetArray;
tcuParamSetSize                       *cuParamSetSize;
tcuParamSeti                          *cuParamSeti;
tcuParamSetf                          *cuParamSetf;
tcuParamSetv                          *cuParamSetv;
tcuParamSetTexRef                     *cuParamSetTexRef;
tcuLaunch                             *cuLaunch;
tcuLaunchGrid                         *cuLaunchGrid;
tcuLaunchGridAsync                    *cuLaunchGridAsync;
tcuEventCreate                        *cuEventCreate;
tcuEventRecord                        *cuEventRecord;
tcuEventQuery                         *cuEventQuery;
tcuEventSynchronize                   *cuEventSynchronize;
tcuEventDestroy                       *cuEventDestroy;
tcuEventElapsedTime                   *cuEventElapsedTime;
tcuStreamCreate                       *cuStreamCreate;
tcuStreamWaitEvent                    *cuStreamWaitEvent;
tcuStreamAddCallback                  *cuStreamAddCallback;
tcuStreamQuery                        *cuStreamQuery;
tcuStreamSynchronize                  *cuStreamSynchronize;
tcuStreamDestroy                      *cuStreamDestroy;
tcuGraphicsUnregisterResource         *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray  *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer   *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags        *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources               *cuGraphicsMapResources;
tcuGraphicsUnmapResources             *cuGraphicsUnmapResources;
tcuGetExportTable                     *cuGetExportTable;
tcuCtxSetLimit                        *cuCtxSetLimit;
tcuCtxGetLimit                        *cuCtxGetLimit;
tcuCtxGetCacheConfig                  *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig                  *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig              *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig              *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion                   *cuCtxGetApiVersion;
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId     *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId       *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle       *cuIpcGetEventHandle;
tcuIpcOpenEventHandle      *cuIpcOpenEventHandle;
tcuIpcGetMemHandle         *cuIpcGetMemHandle;
tcuIpcOpenMemHandle        *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle       *cuIpcCloseMemHandle;

tcuMipmappedArrayCreate               *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel             *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy              *cuMipmappedArrayDestroy;
tcuMemHostRegister                   *cuMemHostRegister;
tcuMemHostUnregister                 *cuMemHostUnregister;
tcuMemcpyHtoD                        *cuMemcpyHtoD;
tcuMemcpyDtoH                        *cuMemcpyDtoH;
tcuMemcpyDtoD                        *cuMemcpyDtoD;
tcuMemcpyDtoA                        *cuMemcpyDtoA;
tcuMemcpyAtoD                        *cuMemcpyAtoD;
tcuMemcpyHtoA                        *cuMemcpyHtoA;
tcuMemcpyAtoH                        *cuMemcpyAtoH;
tcuMemcpyAtoA                        *cuMemcpyAtoA;
tcuMemcpy2D                          *cuMemcpy2D;
tcuMemcpy2DUnaligned                 *cuMemcpy2DUnaligned;
tcuMemcpy3D                          *cuMemcpy3D;
tcuMemcpyHtoDAsync                   *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync                   *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync                   *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync                   *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync                   *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync                     *cuMemcpy2DAsync;
tcuMemcpy3DAsync                     *cuMemcpy3DAsync;
tcuMemcpy                            *cuMemcpy;
tcuMemcpyPeer                        *cuMemcpyPeer;
tcuMemsetD8                          *cuMemsetD8;
tcuMemsetD16                         *cuMemsetD16;
tcuMemsetD32                         *cuMemsetD32;
tcuMemsetD2D8                        *cuMemsetD2D8;
tcuMemsetD2D16                       *cuMemsetD2D16;
tcuMemsetD2D32                       *cuMemsetD2D32;
tcuFuncSetBlockShape                 *cuFuncSetBlockShape;
tcuFuncSetSharedSize                 *cuFuncSetSharedSize;
tcuFuncGetAttribute                  *cuFuncGetAttribute;
tcuFuncSetCacheConfig                *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig            *cuFuncSetSharedMemConfig;
tcuLaunchKernel                      *cuLaunchKernel;
tcuArrayCreate                       *cuArrayCreate;
tcuArrayGetDescriptor                *cuArrayGetDescriptor;
tcuArrayDestroy                      *cuArrayDestroy;
tcuArray3DCreate                     *cuArray3DCreate;
tcuArray3DGetDescriptor              *cuArray3DGetDescriptor;
tcuTexRefCreate                      *cuTexRefCreate;
tcuTexRefDestroy                     *cuTexRefDestroy;
tcuTexRefSetArray                    *cuTexRefSetArray;
tcuTexRefSetAddress                  *cuTexRefSetAddress;
tcuTexRefSetAddress2D                *cuTexRefSetAddress2D;
tcuTexRefSetFormat                   *cuTexRefSetFormat;
tcuTexRefSetAddressMode              *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode               *cuTexRefSetFilterMode;
tcuTexRefSetFlags                    *cuTexRefSetFlags;
tcuTexRefGetAddress                  *cuTexRefGetAddress;
tcuTexRefGetArray                    *cuTexRefGetArray;
tcuTexRefGetAddressMode              *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode               *cuTexRefGetFilterMode;
tcuTexRefGetFormat                   *cuTexRefGetFormat;
tcuTexRefGetFlags                    *cuTexRefGetFlags;
tcuSurfRefSetArray                   *cuSurfRefSetArray;
tcuSurfRefGetArray                   *cuSurfRefGetArray;
tcuParamSetSize                      *cuParamSetSize;
tcuParamSeti                         *cuParamSeti;
tcuParamSetf                         *cuParamSetf;
tcuParamSetv                         *cuParamSetv;
tcuParamSetTexRef                    *cuParamSetTexRef;
tcuLaunch                            *cuLaunch;
tcuLaunchGrid                        *cuLaunchGrid;
tcuLaunchGridAsync                   *cuLaunchGridAsync;
tcuEventCreate                       *cuEventCreate;
tcuEventRecord                       *cuEventRecord;
tcuEventQuery                        *cuEventQuery;
tcuEventSynchronize                  *cuEventSynchronize;
tcuEventDestroy                      *cuEventDestroy;
tcuEventElapsedTime                  *cuEventElapsedTime;
tcuStreamCreate                      *cuStreamCreate;
tcuStreamWaitEvent                   *cuStreamWaitEvent;
tcuStreamAddCallback                 *cuStreamAddCallback;
tcuStreamQuery                       *cuStreamQuery;
tcuStreamSynchronize                 *cuStreamSynchronize;
tcuStreamDestroy                     *cuStreamDestroy;
tcuGraphicsUnregisterResource        *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer  *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags       *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources              *cuGraphicsMapResources;
tcuGraphicsUnmapResources            *cuGraphicsUnmapResources;
tcuGetExportTable                    *cuGetExportTable;
tcuCtxSetLimit                       *cuCtxSetLimit;
tcuCtxGetLimit                       *cuCtxGetLimit;
tcuCtxGetCacheConfig                 *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig                 *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig             *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig             *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion                  *cuCtxGetApiVersion;

tcuProfilerStop                       *cuProfilerStop;
tcuMipmappedArrayCreate   *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy  *cuMipmappedArrayDestroy;

tcuProfilerStop *cuProfilerStop;
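
// Note (editor): each pointer above shadows the Driver API entry point of the
// same name; cuda_drvapi_dynlink resolves them at runtime from the driver
// library (dlopen / GetProcAddress) rather than linking against libcuda directly.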
 | 
			
		||||
 | 
			
		||||
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin                          *cuD3D9Begin;
tcuD3D9End                            *cuD3DEnd;
tcuD3D9RegisterVertexBuffer           *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer                *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer              *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer         *cuD3D9UnregisterVertexBuffer;
tcuD3D9Begin                  *cuD3D9Begin;
tcuD3D9End                    *cuD3DEnd;
tcuD3D9RegisterVertexBuffer   *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer        *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer      *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;

// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice              *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource               *cuD3D9RegisterResource;
tcuD3D9UnregisterResource             *cuD3D9UnregisterResource;
tcuD3D9MapResources                   *cuD3D9MapResources;
tcuD3D9UnmapResources                 *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags            *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions   *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray         *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer       *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize          *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch         *cuD3D9ResourceGetMappedPitch;
tcuD3D9GetDirect3DDevice            *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource             *cuD3D9RegisterResource;
tcuD3D9UnregisterResource           *cuD3D9UnregisterResource;
tcuD3D9MapResources                 *cuD3D9MapResources;
tcuD3D9UnmapResources               *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags          *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray       *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer     *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize        *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch       *cuD3D9ResourceGetMappedPitch;

// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice                      *cuD3D9GetDevice;
tcuD3D9CtxCreate                      *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource       *cuGraphicsD3D9RegisterResource;
tcuD3D9GetDevice                *cuD3D9GetDevice;
tcuD3D9CtxCreate                *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif

#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice                     *cuD3D10GetDevice;
tcuD3D10CtxCreate                     *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource      *cuGraphicsD3D10RegisterResource;
tcuD3D10GetDevice                *cuD3D10GetDevice;
tcuD3D10CtxCreate                *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif


#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice                     *cuD3D11GetDevice;
tcuD3D11CtxCreate                     *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource      *cuGraphicsD3D11RegisterResource;
tcuD3D11GetDevice                *cuD3D11GetDevice;
tcuD3D11CtxCreate                *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif

// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate                        *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer           *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage            *cuGraphicsGLRegisterImage;
tcuGLCtxCreate              *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage  *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice                       *cuWGLGetDevice;
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif

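Editor's note on the declarations above: each `tcuX` is a typedef of the corresponding driver entry point's signature, and the matching `*cuX` pointer is filled in at runtime by the GET_PROC machinery in the hunks that follow. A minimal sketch of the pattern, using a hypothetical entry point `cuFoo` (the real typedefs live earlier in cuda_drvapi_dynlink.h):

    // Hedged sketch with a hypothetical name, for illustration only.
    typedef CUresult CUDAAPI tcuFoo(int param); // signature of the driver symbol
    tcuFoo *cuFoo;                              // resolved later via GET_PROC(cuFoo)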
@@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = LoadLibrary(__CudaLibName);

    if (*pInstance == NULL)
    {
    if (*pInstance == NULL) {
        printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                     \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);               \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               #name, __CudaLibName);                                  \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX(name, alias, required)                                               \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);                                \
    if (alias == NULL && required) {                                                     \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                       \
    }

#define GET_PROC_EX_V2(name, alias, required)                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v2), __CudaLibName);                       \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V2(name, alias, required)                                                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));                                \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#define GET_PROC_EX_V3(name, alias, required)                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v3), __CudaLibName);                       \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V3(name, alias, required)                                                           \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));                                \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)

#include <dlfcn.h>

#if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__)
#if defined (__aarch64__)
#if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = dlopen(__CudaLibName, RTLD_NOW);

    if (*pInstance == NULL)
    {
    if (*pInstance == NULL) {
        printf("dlopen \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                              \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                        \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               #name, __CudaLibName);                                  \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX(name, alias, required)                                               \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                                         \
    if (alias == NULL && required) {                                                     \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                       \
    }

#define GET_PROC_EX_V2(name, alias, required)                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));         \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v2), __CudaLibName);                    \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V2(name, alias, required)                                                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));                                         \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#define GET_PROC_EX_V3(name, alias, required)                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));         \
    if (alias == NULL && required) {                                    \
        printf("Failed to find required function \"%s\" in %s\n",       \
               STRINGIFY(name##_v3), __CudaLibName);                    \
        return CUDA_ERROR_UNKNOWN;                                      \
#define GET_PROC_EX_V3(name, alias, required)                                                           \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));                                         \
    if (alias == NULL && required) {                                                                    \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
        return CUDA_ERROR_UNKNOWN;                                                                      \
    }

#else
#error unsupported platform
#endif
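Editor's note: to make the macro mechanics concrete, this is roughly what `GET_PROC_EX(cuStreamCreate, cuStreamCreate, 1)` expands to on the dlsym path above; `t##name` pastes the typedef name and `#name` stringifies the symbol:

    cuStreamCreate = (tcuStreamCreate *)dlsym(CudaDrvLib, "cuStreamCreate");
    if (cuStreamCreate == NULL && 1) {
        printf("Failed to find required function \"%s\" in %s\n", "cuStreamCreate", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }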

#define CHECKED_CALL(call)              \
    do {                                \
        CUresult result = (call);       \
        if (CUDA_SUCCESS != result) {   \
            return result;              \
        }                               \
    } while(0)
#define CHECKED_CALL(call)            \
    do {                              \
        CUresult result = (call);     \
        if (CUDA_SUCCESS != result) { \
            return result;            \
        }                             \
    } while (0)

#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name)          GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name)       GET_PROC_EX_V2(name,name,1)
#define GET_PROC_V3(name)       GET_PROC_EX_V3(name,name,1)
#define GET_PROC_V2(name)       GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name)       GET_PROC_EX_V3(name, name, 1)

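Editor's note: CHECKED_CALL uses the classic do { } while (0) wrapper so the macro behaves as a single statement wherever it is used. A small sketch of why that matters:

    // Without the do/while(0) wrapper, the semicolon after the macro call
    // would break an if/else like this one:
    if (useDriver)                                 // hypothetical flag
        CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));   // expands to one statement
    else
        printf("skipping driver load\n");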
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{
    CUDADRIVER CudaDrvLib;
    int driverVer = 1000;
    int        driverVer = 1000;

    CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));

@@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    // available since 2.2. if not present, version 1.0 is assumed
    GET_PROC_OPTIONAL(cuDriverGetVersion);

    if (cuDriverGetVersion)
    {
    if (cuDriverGetVersion) {
        CHECKED_CALL(cuDriverGetVersion(&driverVer));
    }

@@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    GET_PROC(cuStreamDestroy);

    // These are CUDA 5.0 new functions
    if (driverVer >= 5000)
    {
    if (driverVer >= 5000) {
        GET_PROC(cuMipmappedArrayCreate);
        GET_PROC(cuMipmappedArrayDestroy);
        GET_PROC(cuMipmappedArrayGetLevel);
    }

    // These are CUDA 4.2 new functions
    if (driverVer >= 4020)
    {
    if (driverVer >= 4020) {
        GET_PROC(cuFuncSetSharedMemConfig);
        GET_PROC(cuCtxGetSharedMemConfig);
        GET_PROC(cuCtxSetSharedMemConfig);
    }

    // These are CUDA 4.1 new functions
    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
    {
    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
        GET_PROC(cuDeviceGetByPCIBusId);
        GET_PROC(cuDeviceGetPCIBusId);
        GET_PROC(cuIpcGetEventHandle);
@@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    }

    // These could be _v2 interfaces
    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
    {
    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
        GET_PROC_V2(cuCtxDestroy);
        GET_PROC_V2(cuCtxPopCurrent);
        GET_PROC_V2(cuCtxPushCurrent);
@@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC_V2(cuEventDestroy);
    }

    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
    {
    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
        GET_PROC_V2(cuDeviceTotalMem);
        GET_PROC_V2(cuCtxCreate);
        GET_PROC_V2(cuModuleGetGlobal);
@@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC_V2(cuTexRefSetAddress);
        GET_PROC_V2(cuTexRefGetAddress);

        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
        {
        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
            GET_PROC_V3(cuTexRefSetAddress2D);
        }
        else
        {
        else {
            GET_PROC_V2(cuTexRefSetAddress2D);
        }
    }
    else
    {
    else {
        // versions earlier than 3020
        GET_PROC(cuDeviceTotalMem);
        GET_PROC(cuCtxCreate);
@@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
    }

    // The following functions are specific to CUDA versions
    if (driverVer >= 4000)
    {
    if (driverVer >= 4000) {
        GET_PROC(cuCtxSetCurrent);
        GET_PROC(cuCtxGetCurrent);
        GET_PROC(cuMemHostRegister);
@@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuProfilerStop);
    }

    if (driverVer >= 3010)
    {
    if (driverVer >= 3010) {
        GET_PROC(cuModuleGetSurfRef);
        GET_PROC(cuSurfRefSetArray);
        GET_PROC(cuSurfRefGetArray);
@@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuCtxGetLimit);
    }

    if (driverVer >= 3000)
    {
    if (driverVer >= 3000) {
        GET_PROC(cuMemcpyDtoDAsync);
        GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11
@@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuGraphicsUnregisterResource);
        GET_PROC(cuGraphicsSubResourceGetMappedArray);

        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
        {
        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
            GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
        }
        else
        {
        else {
            GET_PROC(cuGraphicsResourceGetMappedPointer);
        }

@@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
        GET_PROC(cuGetExportTable);
    }

    if (driverVer >= 2030)
    {
    if (driverVer >= 2030) {
        GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10
        GET_PROC(cuD3D10GetDevice);
@@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif
    }

    if (driverVer >= 2010)
    {
    if (driverVer >= 2010) {
        GET_PROC(cuModuleLoadDataEx);
        GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL
        GET_PROC(cuGLCtxCreate);
        GET_PROC(cuGraphicsGLRegisterBuffer);
        GET_PROC(cuGraphicsGLRegisterImage);
#  ifdef WIN32
#ifdef WIN32
        GET_PROC(cuWGLGetDevice);
#  endif
#endif
#endif
#ifdef CUDA_INIT_D3D9
        GET_PROC(cuD3D9GetDevice);

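Editor's note: taken together, the hunks above implement one init sequence: load the driver library, optionally resolve cuDriverGetVersion, then gate each batch of GET_PROC calls on the reported version. A condensed sketch of that control flow (it abbreviates the real cuInit; the local must be named CudaDrvLib because the GET_PROC macros reference it):

    CUDADRIVER CudaDrvLib;
    int driverVer = 1000;                  // version 1.0 assumed if the query is absent
    CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
    GET_PROC_OPTIONAL(cuDriverGetVersion); // available since CUDA 2.2
    if (cuDriverGetVersion) {
        CHECKED_CALL(cuDriverGetVersion(&driverVer));
    }
    if (driverVer >= 5000) {               // only resolve symbols this driver exports
        GET_PROC(cuMipmappedArrayCreate);
    }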
(File diff suppressed because it is too large)
@@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H

#include <helper_string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <helper_string.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

#ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) {
  return (value >= 0 ? static_cast<int>(value + 0.5)
                     : static_cast<int>(value - 0.5));
}
inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
#endif

#ifndef EXIT_WAIVED
@@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
  if (CUDA_SUCCESS != err) {
    const char *errorStr = NULL;
    cuGetErrorString(err, &errorStr);
    fprintf(stderr,
            "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
            "line %i.\n",
            err, errorStr, file, line);
    exit(EXIT_FAILURE);
  }
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
    if (CUDA_SUCCESS != err) {
        const char *errorStr = NULL;
        cuGetErrorString(err, &errorStr);
        fprintf(stderr,
                "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
                "line %i.\n",
                err,
                errorStr,
                file,
                line);
        exit(EXIT_FAILURE);
    }
}
#endif

// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
                             int device) {
  checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
#endif

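Editor's note: a short usage sketch for the two helpers above, assuming cuInit has already succeeded:

    int computeMode = 0;
    // getCudaAttribute wraps cuDeviceGetAttribute and, via checkCudaErrors,
    // aborts with a file/line diagnostic on any CUresult other than CUDA_SUCCESS.
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, /*device=*/0);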
// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine the #
  // of cores per SM
  typedef struct {
    int SM;  // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
             // minor version
    int Cores;
  } sSMtoCores;
inline int _ConvertSMVer2CoresDRV(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the #
    // of cores per SM
    typedef struct
    {
        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
                // minor version
        int Cores;
    } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192},
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128},
      {0x52, 128},
      {0x53, 128},
      {0x60,  64},
      {0x61, 128},
      {0x62, 128},
      {0x70,  64},
      {0x72,  64},
      {0x75,  64},
      {0x80,  64},
      {0x86, 128},
      {0x87, 128},
      {0x90, 128},
      {-1, -1}};
    sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
                                       {0x32, 192},
                                       {0x35, 192},
                                       {0x37, 192},
                                       {0x50, 128},
                                       {0x52, 128},
                                       {0x53, 128},
                                       {0x60, 64},
                                       {0x61, 128},
                                       {0x62, 128},
                                       {0x70, 64},
                                       {0x72, 64},
                                       {0x75, 64},
                                       {0x80, 64},
                                       {0x86, 128},
                                       {0x87, 128},
                                       {0x90, 128},
                                       {-1, -1}};

  int index = 0;
    int index = 0;

  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
      return nGpuArchCoresPerSM[index].Cores;
    while (nGpuArchCoresPerSM[index].SM != -1) {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    index++;
  }

  // If we don't find the values, we default use the previous one to run
  // properly
  printf(
      "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
    // If we don't find the values, we default use the previous one to run
    // properly
    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
           major,
           minor,
           nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
}
  // end of GPU Architecture definitions
// end of GPU Architecture definitions

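Editor's note: the lookup key packs the compute capability into one byte, major in the high nibble and minor in the low nibble. For example, SM 8.6 gives (8 << 4) + 6 = 0x86, which the table above maps to 128 cores per SM:

    int cores = _ConvertSMVer2CoresDRV(8, 6); // (8 << 4) + 6 == 0x86 -> 128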
#ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
  int cuDevice = 0;
  int deviceCount = 0;
  checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
{
    int cuDevice    = 0;
    int deviceCount = 0;
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));

  checkCudaErrors(cuDeviceGetCount(&deviceCount));
    checkCudaErrors(cuDeviceGetCount(&deviceCount));

  if (deviceCount == 0) {
    fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
    exit(EXIT_FAILURE);
  }
    if (deviceCount == 0) {
        fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }

  int dev = 0;
  dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
    int dev = 0;
    dev     = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");

  if (dev < 0) {
    dev = 0;
  }
    if (dev < 0) {
        dev = 0;
    }

  if (dev > deviceCount - 1) {
    fprintf(stderr, "\n");
    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
            deviceCount);
    fprintf(stderr,
            ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
            dev);
    fprintf(stderr, "\n");
    return -dev;
  }
    if (dev > deviceCount - 1) {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
        fprintf(stderr, "\n");
        return -dev;
    }

  checkCudaErrors(cuDeviceGet(&cuDevice, dev));
  char name[100];
  checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
    checkCudaErrors(cuDeviceGet(&cuDevice, dev));
    char name[100];
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));

  int computeMode;
  getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

  if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
    fprintf(stderr,
            "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
            "threads can use this CUDA Device.\n");
    return -1;
  }
    if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
        fprintf(stderr,
                "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
                "threads can use this CUDA Device.\n");
        return -1;
    }

  if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
    printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
  }
    if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
        printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
    }

  return dev;
    return dev;
}

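Editor's note: gpuDeviceInitDRV reads the device index from a "device=" command-line argument, so a typical call site looks like this sketch:

    int main(int argc, char **argv)
    {
        // Picks the GPU named on the command line (e.g. "-device=1"),
        // falling back to device 0 when the flag is absent or negative.
        int dev = gpuDeviceInitDRV(argc, (const char **)argv);
        return (dev < 0) ? 1 : 0;
    }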
// This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() {
  CUdevice current_device = 0;
  CUdevice max_perf_device = 0;
  int device_count = 0;
  int sm_per_multiproc = 0;
  unsigned long long max_compute_perf = 0;
  int major = 0;
  int minor = 0;
  int multiProcessorCount;
  int clockRate;
  int devices_prohibited = 0;
inline int gpuGetMaxGflopsDeviceIdDRV()
{
    CUdevice           current_device   = 0;
    CUdevice           max_perf_device  = 0;
    int                device_count     = 0;
    int                sm_per_multiproc = 0;
    unsigned long long max_compute_perf = 0;
    int                major            = 0;
    int                minor            = 0;
    int                multiProcessorCount;
    int                clockRate;
    int                devices_prohibited = 0;

  cuInit(0, __CUDA_API_VERSION);
  checkCudaErrors(cuDeviceGetCount(&device_count));
    cuInit(0, __CUDA_API_VERSION);
    checkCudaErrors(cuDeviceGetCount(&device_count));

  if (device_count == 0) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
    exit(EXIT_FAILURE);
  }

  // Find the best CUDA capable GPU device
  current_device = 0;

  while (current_device < device_count) {
    checkCudaErrors(cuDeviceGetAttribute(
        &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
        current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));

    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
                          current_device);

    if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
      if (major == 9999 && minor == 9999) {
        sm_per_multiproc = 1;
      } else {
        sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
      }

      unsigned long long compute_perf =
          (unsigned long long)(multiProcessorCount * sm_per_multiproc *
                               clockRate);

      if (compute_perf > max_compute_perf) {
          max_compute_perf = compute_perf;
          max_perf_device = current_device;
      }
    } else {
      devices_prohibited++;
    if (device_count == 0) {
        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }

    ++current_device;
  }
    // Find the best CUDA capable GPU device
    current_device = 0;

  if (devices_prohibited == device_count) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
            "prohibited.\n");
    exit(EXIT_FAILURE);
  }
    while (current_device < device_count) {
        checkCudaErrors(
            cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));

  return max_perf_device;
        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);

        if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
            if (major == 9999 && minor == 9999) {
                sm_per_multiproc = 1;
            }
            else {
                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
            }

            unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

            if (compute_perf > max_compute_perf) {
                max_compute_perf = compute_perf;
                max_perf_device  = current_device;
            }
        }
        else {
            devices_prohibited++;
        }

        ++current_device;
    }

    if (devices_prohibited == device_count) {
        fprintf(stderr,
                "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
                "prohibited.\n");
        exit(EXIT_FAILURE);
    }

    return max_perf_device;
}

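Editor's note: the selection heuristic scores each device as multiProcessorCount * coresPerSM * clockRate (clockRate is reported in kHz). For instance, a device with 28 SMs of 128 cores at 1,500,000 kHz scores 28 * 128 * 1500000 = 5,376,000,000, and the highest score wins. Note, as a caution, that the product is formed in int before the cast to unsigned long long, so scores of this size can overflow on 32-bit int; computing in 64 bits first would avoid that:

    unsigned long long compute_perf =
        (unsigned long long)multiProcessorCount * sm_per_multiproc * clockRate;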
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
  CUdevice cuDevice;
  int devID = 0;
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
{
    CUdevice cuDevice;
    int      devID = 0;

  // If the command-line has a device number specified, use it
  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
    devID = gpuDeviceInitDRV(argc, argv);
    // If the command-line has a device number specified, use it
    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        devID = gpuDeviceInitDRV(argc, argv);

    if (devID < 0) {
      printf("exiting...\n");
      exit(EXIT_SUCCESS);
        if (devID < 0) {
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }
    }
    else {
        // Otherwise pick the device with highest Gflops/s
        char name[100];
        devID = gpuGetMaxGflopsDeviceIdDRV();
        checkCudaErrors(cuDeviceGet(&cuDevice, devID));
        cuDeviceGetName(name, 100, cuDevice);
        printf("> Using CUDA Device [%d]: %s\n", devID, name);
    }
  } else {
    // Otherwise pick the device with highest Gflops/s
    char name[100];
    devID = gpuGetMaxGflopsDeviceIdDRV();
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    cuDeviceGetName(name, 100, cuDevice);
    printf("> Using CUDA Device [%d]: %s\n", devID, name);
  }

  cuDeviceGet(&cuDevice, devID);
    cuDeviceGet(&cuDevice, devID);

  return cuDevice;
    return cuDevice;
}

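Editor's note: a typical call site for the helper above, sketched under the same assumptions as the SDK samples:

    CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // cuDevice is now either the user-specified device or the highest-Gflops one.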
inline CUdevice findIntegratedGPUDrv() {
  CUdevice current_device = 0;
  int device_count = 0;
  int devices_prohibited = 0;
  int isIntegrated;
inline CUdevice findIntegratedGPUDrv()
{
    CUdevice current_device     = 0;
    int      device_count       = 0;
    int      devices_prohibited = 0;
    int      isIntegrated;

  cuInit(0, __CUDA_API_VERSION);
  checkCudaErrors(cuDeviceGetCount(&device_count));
    cuInit(0, __CUDA_API_VERSION);
    checkCudaErrors(cuDeviceGetCount(&device_count));

  if (device_count == 0) {
    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }

  // Find the integrated GPU which is compute capable
  while (current_device < device_count) {
    int computeMode = -1;
    checkCudaErrors(cuDeviceGetAttribute(
        &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
    checkCudaErrors(cuDeviceGetAttribute(
        &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));

    // If GPU is integrated and is not running on Compute Mode prohibited use
    // that
    if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
      int major = 0, minor = 0;
      char deviceName[256];
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
          current_device));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
          current_device));
      checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
             current_device, deviceName, major, minor);

      return current_device;
    } else {
      devices_prohibited++;
    if (device_count == 0) {
        fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    current_device++;
  }
    // Find the integrated GPU which is compute capable
    while (current_device < device_count) {
        int computeMode = -1;
        checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));

  if (devices_prohibited == device_count) {
    fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
    exit(EXIT_FAILURE);
  }
        // If GPU is integrated and is not running on Compute Mode prohibited use
        // that
        if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
            int  major = 0, minor = 0;
            char deviceName[256];
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
            checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);

  return -1;
            return current_device;
        }
        else {
            devices_prohibited++;
        }

        current_device++;
    }

    if (devices_prohibited == device_count) {
        fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
        exit(EXIT_FAILURE);
    }

    return -1;
}

// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
                                     int devID) {
  CUdevice cuDevice;
  char name[256];
  int major = 0, minor = 0;
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
{
    CUdevice cuDevice;
    char     name[256];
    int      major = 0, minor = 0;

  checkCudaErrors(cuDeviceGet(&cuDevice, devID));
  checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));

  if ((major > major_version) ||
      (major == major_version && minor >= minor_version)) {
    printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
           major, minor);
    return true;
  } else {
    printf(
        "No GPU device was found that can support CUDA compute capability "
        "%d.%d.\n",
        major_version, minor_version);
    return false;
  }
    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
        return true;
    }
    else {
        printf("No GPU device was found that can support CUDA compute capability "
               "%d.%d.\n",
               major_version,
               minor_version);
        return false;
    }
}
#endif

  // end of CUDA Helper Functions

#endif  // HELPER_CUDA_DRVAPI_H
// end of CUDA Helper Functions

#endif // HELPER_CUDA_DRVAPI_H

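Editor's note: a usage sketch for the capability gate above, e.g. requiring at least SM 3.0 before proceeding (EXIT_WAIVED is the sample convention defined earlier in this header):

    if (!checkCudaCapabilitiesDRV(3, 0, devID)) {
        exit(EXIT_WAIVED); // waive the test rather than fail outright
    }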
@@ -34,8 +34,8 @@
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif // _MATRIXMUL_H_

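Editor's note: with block_size = 32 (the value initCUDA selects further down), these macros work out to WA = WB = WC = 4 * 32 = 128 and HA = HC = 6 * 32 = 192, with HB = WA = 128; so A is 128 wide by 192 high, B is 128 by 128, and C is 128 by 192, which satisfies the usual matrix-multiply constraint HB = WA.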
@@ -43,10 +43,10 @@
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include "cuda_drvapi_dynlink.h"
@@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

#if defined _MSC_VER
#pragma warning (disable : 4312)
#pragma warning(disable : 4312)
#endif


@@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
// Globals
////////////////////////////////////////////////////////////////////////////////
CUcontext g_cuContext;
bool noprompt = false;
bool      noprompt = false;

static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";

@@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size)
{
    for (size_t i = 0; i < size; ++i)
    {
    for (size_t i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
 | 
			
		||||
{
 | 
			
		||||
    CUresult status;
 | 
			
		||||
    CUdevice cuDevice;
 | 
			
		||||
    CUmodule cuModule;
 | 
			
		||||
    CUresult   status;
 | 
			
		||||
    CUdevice   cuDevice;
 | 
			
		||||
    CUmodule   cuModule;
 | 
			
		||||
    CUfunction cuFunction;
 | 
			
		||||
    int major, minor, block_size, devID = 0;
 | 
			
		||||
    char deviceName[256];
 | 
			
		||||
    int        major, minor, block_size, devID = 0;
 | 
			
		||||
    char       deviceName[256];
 | 
			
		||||
 | 
			
		||||
    // link to cuda driver dynamically
 | 
			
		||||
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
 | 
			
		||||
 | 
			
		||||
    // This assumes that the user is attempting to specify a explicit device -device=n
 | 
			
		||||
    if (argc > 1)
 | 
			
		||||
    {
 | 
			
		||||
    if (argc > 1) {
 | 
			
		||||
        bool bFound = false;
 | 
			
		||||
 | 
			
		||||
        for (int param=0; param < argc; param++)
 | 
			
		||||
        {
 | 
			
		||||
            if (!strncmp(argv[param], "-device", 7))
 | 
			
		||||
            {
 | 
			
		||||
                int i=(int)strlen(argv[1]);
 | 
			
		||||
        for (int param = 0; param < argc; param++) {
 | 
			
		||||
            if (!strncmp(argv[param], "-device", 7)) {
 | 
			
		||||
                int i = (int)strlen(argv[1]);
 | 
			
		||||
 | 
			
		||||
                while (argv[1][i] != '=')
 | 
			
		||||
                {
 | 
			
		||||
                while (argv[1][i] != '=') {
 | 
			
		||||
                    i--;
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                devID = atoi(&argv[1][++i]);
 | 
			
		||||
                devID  = atoi(&argv[1][++i]);
 | 
			
		||||
                bFound = true;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
 | 
			
		||||
    int deviceCount = 0;
 | 
			
		||||
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
 | 
			
		||||
 | 
			
		||||
    if (deviceCount == 0)
 | 
			
		||||
    {
 | 
			
		||||
    if (deviceCount == 0) {
 | 
			
		||||
        fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
 | 
			
		||||
        exit(EXIT_SUCCESS);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (devID < 0) devID = 0;
 | 
			
		||||
    if (devID < 0)
 | 
			
		||||
        devID = 0;
 | 
			
		||||
 | 
			
		||||
    if (devID > deviceCount -1)
 | 
			
		||||
    {
 | 
			
		||||
    if (devID > deviceCount - 1) {
 | 
			
		||||
        fprintf(stderr, "initCUDA (Device=%d) invalid GPU device.  %d GPU device(s) detected.\n\n", devID, deviceCount);
 | 
			
		||||
        status = CUDA_ERROR_NOT_FOUND;
 | 
			
		||||
 | 
			
		||||
@@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
     printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);

-    block_size = 32;
+    block_size      = 32;
     *block_size_out = block_size;

     // create context for picked device
     status = cuCtxCreate(&g_cuContext, 0, cuDevice);

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         cuCtxDestroy(g_cuContext);
         exit(EXIT_SUCCESS);
     }
@@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     {
         // in this branch we use compilation with parameters
         const unsigned int jitNumOptions = 3;
-        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
-        void **jitOptVals = new void *[jitNumOptions];
+        CUjit_option      *jitOptions    = new CUjit_option[jitNumOptions];
+        void             **jitOptVals    = new void *[jitNumOptions];

         // set up size of compilation log buffer
-        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+        jitOptions[0]        = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
         int jitLogBufferSize = 1024;
-        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
+        jitOptVals[0]        = (void *)(size_t)jitLogBufferSize;

         // set up pointer to the compilation log buffer
-        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
+        jitOptions[1]      = CU_JIT_INFO_LOG_BUFFER;
         char *jitLogBuffer = new char[jitLogBufferSize];
-        jitOptVals[1] = jitLogBuffer;
+        jitOptVals[1]      = jitLogBuffer;

         // set up pointer to set the Maximum # of registers for a particular kernel
-        jitOptions[2] = CU_JIT_MAX_REGISTERS;
+        jitOptions[2]   = CU_JIT_MAX_REGISTERS;
         int jitRegCount = 32;
-        jitOptVals[2] = (void *)(size_t)jitRegCount;
+        jitOptVals[2]   = (void *)(size_t)jitRegCount;

         // compile with set parameters
         printf("> Compiling CUDA module\n");

 #if defined(_WIN64) || defined(__LP64__)
-        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status =
+            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #else
-        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status =
+            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #endif

         printf("> PTX JIT log:\n%s\n", jitLogBuffer);

-        delete [] jitOptions;
-        delete [] jitOptVals;
-        delete [] jitLogBuffer;
+        delete[] jitOptions;
+        delete[] jitOptVals;
+        delete[] jitLogBuffer;
     }

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         printf("Error while compiling PTX\n");
         cuCtxDestroy(g_cuContext);
         exit(EXIT_FAILURE);
     }

     // retrieve CUDA function from the compiled module
-    status = cuModuleGetFunction(&cuFunction, cuModule,
-                                 (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
+    status = cuModuleGetFunction(
+        &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         cuCtxDestroy(g_cuContext);
         exit(EXIT_FAILURE);
     }
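Note on the API reflowed in this hunk: cuModuleLoadDataEx passes JIT options as two parallel arrays, one of CUjit_option keys and one of void * values, with scalar values cast through the pointer slot rather than pointed to. A minimal sketch of the same idiom, assuming ptxImage points at a NUL-terminated PTX string (ptxImage is a stand-in name, not from the sample):

    // Sketch: JIT an in-memory PTX image and capture the info log.
    CUjit_option opts[2]    = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER};
    char         logBuf[1024];
    void        *optVals[2] = {(void *)(size_t)sizeof(logBuf), (void *)logBuf};
    CUmodule     mod;
    CUresult     rc = cuModuleLoadDataEx(&mod, ptxImage, 2, opts, optVals);
    if (rc != CUDA_SUCCESS)
        fprintf(stderr, "PTX JIT failed:\n%s\n", logBuf);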
@@ -233,21 +226,21 @@ int main(int argc, char **argv)
     printf("[ %s ]\n", sSDKsample);

     // initialize CUDA
-    CUfunction matrixMul = NULL;
-    int block_size = 0;
+    CUfunction matrixMul  = NULL;
+    int        block_size = 0;
     checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));

     // set seed for rand()
     srand(2006);

     // allocate host memory for matrices A and B
-    size_t       size_A = WA * HA;
-    size_t       mem_size_A = sizeof(float) * size_A;
-    size_t       size_B = WB * HB;
-    size_t       mem_size_B = sizeof(float) * size_B;
+    size_t size_A     = WA * HA;
+    size_t mem_size_A = sizeof(float) * size_A;
+    size_t size_B     = WB * HB;
+    size_t mem_size_B = sizeof(float) * size_B;

-    float *h_A = (float *) malloc(mem_size_A);
-    float *h_B = (float *) malloc(mem_size_B);
+    float *h_A = (float *)malloc(mem_size_A);
+    float *h_B = (float *)malloc(mem_size_B);

     // initialize host memory
     randomInit(h_A, size_A);
@@ -264,26 +257,24 @@ int main(int argc, char **argv)
     checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

     // allocate device memory for result
-    size_t       size_C = WC * HC;
-    size_t       mem_size_C = sizeof(float) * size_C;
+    size_t size_C     = WC * HC;
+    size_t mem_size_C = sizeof(float) * size_C;

     CUdeviceptr d_C;
     checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

     // allocate mem for the result on host side
-    float *h_C = (float *) malloc(mem_size_C);
+    float *h_C = (float *)malloc(mem_size_C);

 #if __CUDA_API_VERSION >= 4000
     {
         // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
-        int Matrix_Width_A = WA;
-        int Matrix_Width_B = WB;
-        void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
+        int   Matrix_Width_A = WA;
+        int   Matrix_Width_B = WB;
+        void *args[5]        = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};

-        checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
-                                       block_size     , block_size     , 1,
-                                       0,
-                                       NULL, args, NULL));
+        checkCudaErrors(cuLaunchKernel(
+            matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
     }
 #else // __CUDA_API_VERSION <= 3020
     {
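On the launch call reflowed above: cuLaunchKernel takes the grid dimensions, the block dimensions, the dynamic shared-memory size, a stream handle, and a kernelParams array in which each entry is the address of the corresponding kernel argument, so scalars must stay alive until the launch is issued. A minimal sketch with a hypothetical function handle fn taking (float *out, int n); fn and out are assumptions, not names from the sample:

    CUdeviceptr out;          // assumed allocated earlier with cuMemAlloc
    int         n = 1 << 20;  // scalar argument: its *address* goes into the array
    void *params[] = {&out, &n};
    // grid (4096,1,1), block (256,1,1), no dynamic shared memory, default stream
    checkCudaErrors(cuLaunchKernel(fn, 4096, 1, 1, 256, 1, 1, 0, NULL, params, NULL));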
@@ -312,7 +303,7 @@ int main(int argc, char **argv)

         checkCudaErrors(cuParamSetSize(matrixMul, offset));
         checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
-        checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float)));
+        checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));

         // set execution configuration for the CUDA kernel
         checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@@ -322,19 +313,18 @@ int main(int argc, char **argv)
     checkCudaErrors(cuCtxSynchronize());

     // copy result from device to host
-    checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C));
+    checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));

     // compute reference solution
-    float *reference = (float *) malloc(mem_size_C);
+    float *reference = (float *)malloc(mem_size_C);
     computeGold(reference, h_A, h_B, HA, WA, WB);

     // check result
-    float diff=0.0f;
+    float diff = 0.0f;

-    for (unsigned int i=0; i<size_C; i++)
-    {
+    for (unsigned int i = 0; i < size_C; i++) {
         float tmp = reference[i] - h_C[i];
-        diff += tmp*tmp;
+        diff += tmp * tmp;
     }

     int res = (diff / (float)size_C < 1e-6f);
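The acceptance test above divides the accumulated squared error by the element count, i.e. it requires the mean squared error to stay below 1e-6, which corresponds to an RMS deviation of about 1e-3 per element. The same check factored into a standalone helper, for reference (the helper name is illustrative, not from the sample):

    // Mean-squared-error acceptance test, equivalent to the loop above.
    static int resultsMatch(const float *ref, const float *got, size_t n)
    {
        float sumSq = 0.0f;
        for (size_t i = 0; i < n; i++) {
            float d = ref[i] - got[i];
            sumSq += d * d;
        }
        return (sumSq / (float)n) < 1e-6f;
    }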
@@ -349,7 +339,7 @@ int main(int argc, char **argv)
     checkCudaErrors(cuMemFree(d_C));
     checkCudaErrors(cuCtxDestroy(g_cuContext));

-    printf("Test run %s\n", (1==res) ? "success!" : "failed!");
+    printf("Test run %s\n", (1 == res) ? "success!" : "failed!");

     exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
 }
@@ -28,8 +28,7 @@

 ////////////////////////////////////////////////////////////////////////////////
 // export C interface
-extern "C"
-void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set
@@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
 //! @param hA         height of matrix A
 //! @param wB         width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void
-computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 {
     for (unsigned int i = 0; i < hA; ++i)
-        for (unsigned int j = 0; j < wB; ++j)
-        {
+        for (unsigned int j = 0; j < wB; ++j) {
             double sum = 0;

-            for (unsigned int k = 0; k < wA; ++k)
-            {
+            for (unsigned int k = 0; k < wA; ++k) {
                 double a = A[i * wA + k];
                 double b = B[k * wB + j];
                 sum += a * b;
(File diff suppressed because it is too large)
@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_32_ptxdump_h__

 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

     extern unsigned char matrixMul_kernel_32_ptxdump[25784];
(File diff suppressed because it is too large)
@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_64_ptxdump_h__

 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

     extern unsigned char matrixMul_kernel_64_ptxdump[26489];
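The #if defined __cplusplus guard around these ptxdump declarations gives the symbols C linkage, so the generated arrays can be referenced from both C and C++ translation units; only the brace placement changes in this commit. The pattern in isolation, with a hypothetical symbol name and size:

    /* Hypothetical header sketch of the guard pattern used above. */
    #if defined __cplusplus
    extern "C"
    {
    #endif

        extern unsigned char example_kernel_ptxdump[1024]; /* placeholder name and size */

    #if defined __cplusplus
    }
    #endif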
@@ -42,207 +42,208 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 #include "nvrtc_helper.h"

 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>

-void constantInit(float *data, int size, float val) {
-  for (int i = 0; i < size; ++i) {
-    data[i] = val;
-  }
+void constantInit(float *data, int size, float val)
+{
+    for (int i = 0; i < size; ++i) {
+        data[i] = val;
+    }
 }
 /**
  * Run a simple test of matrix multiplication using CUDA
  */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
-                   dim3 &dimsB) {
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = dimsA.x * dimsA.y;
-  unsigned int mem_size_A = sizeof(float) * size_A;
-  float *h_A = (float *)malloc(mem_size_A);
-  unsigned int size_B = dimsB.x * dimsB.y;
-  unsigned int mem_size_B = sizeof(float) * size_B;
-  float *h_B = (float *)malloc(mem_size_B);
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
+{
+    // Allocate host memory for matrices A and B
+    unsigned int size_A     = dimsA.x * dimsA.y;
+    unsigned int mem_size_A = sizeof(float) * size_A;
+    float       *h_A        = (float *)malloc(mem_size_A);
+    unsigned int size_B     = dimsB.x * dimsB.y;
+    unsigned int mem_size_B = sizeof(float) * size_B;
+    float       *h_B        = (float *)malloc(mem_size_B);

-  // Initialize host memory
-  const float valB = 0.01f;
-  constantInit(h_A, size_A, 1.0f);
-  constantInit(h_B, size_B, valB);
+    // Initialize host memory
+    const float valB = 0.01f;
+    constantInit(h_A, size_A, 1.0f);
+    constantInit(h_B, size_B, valB);

-  // Allocate device memory
-  CUdeviceptr d_A, d_B, d_C;
+    // Allocate device memory
+    CUdeviceptr d_A, d_B, d_C;

-  char *cubin, *kernel_file;
-  size_t cubinSize;
+    char  *cubin, *kernel_file;
+    size_t cubinSize;

-  kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
-  compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
+    kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
+    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);

-  CUmodule module = loadCUBIN(cubin, argc, argv);
+    CUmodule module = loadCUBIN(cubin, argc, argv);

-  // Allocate host matrix C
-  dim3 dimsC(dimsB.x, dimsA.y, 1);
-  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
-  float *h_C = (float *)malloc(mem_size_C);
+    // Allocate host matrix C
+    dim3         dimsC(dimsB.x, dimsA.y, 1);
+    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
+    float       *h_C        = (float *)malloc(mem_size_C);
-  if (h_C == NULL) {
-    fprintf(stderr, "Failed to allocate host matrix C!\n");
-    exit(EXIT_FAILURE);
-  }

-  checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
-  checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
-  checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

-  // copy host memory to device
-  checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
-  checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

-  // Setup execution parameters
-  dim3 threads(block_size, block_size);
-  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

-  // Create and start timer
-  printf("Computing result using CUDA Kernel...\n");

-  CUfunction kernel_addr;
-  if (block_size == 16) {
-    checkCudaErrors(
-        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
-  } else {
-    checkCudaErrors(
-        cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
-  }

-  void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
-                 (void *)&dimsB.x};

-  // Execute the kernel
-  int nIter = 300;

-  for (int j = 0; j < nIter; j++) {
-    checkCudaErrors(
-        cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
-                       threads.x, threads.y, threads.z,     /* block dim */
-                       0, 0,    /* shared mem, stream */
-                       &arr[0], /* arguments */
-                       0));

-    checkCudaErrors(cuCtxSynchronize());
-  }

-  // Copy result from device to host
-  checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));

-  printf("Checking computed result for correctness: ");

-  bool correct = true;

-  // test relative error by the formula
-  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps

-  double eps = 1.e-6;  // machine zero

-  for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
-    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
-    double dot_length = dimsA.x;
-    double abs_val = fabs(h_C[i]);
-    double rel_err = abs_err / abs_val / dot_length;

-    if (rel_err > eps) {
-      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
-             h_C[i], dimsA.x * valB, eps);
-      correct = false;
+    if (h_C == NULL) {
+        fprintf(stderr, "Failed to allocate host matrix C!\n");
+        exit(EXIT_FAILURE);
+    }
-  }

-  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
+    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
+    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
+    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

-  printf(
-      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
-      "Results may vary when GPU Boost is enabled.\n");
+    // copy host memory to device
+    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
+    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

-  // Clean up memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
+    // Setup execution parameters
+    dim3 threads(block_size, block_size);
+    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

-  checkCudaErrors(cuMemFree(d_A));
-  checkCudaErrors(cuMemFree(d_B));
-  checkCudaErrors(cuMemFree(d_C));
+    // Create and start timer
+    printf("Computing result using CUDA Kernel...\n");

-  if (correct) {
-    return EXIT_SUCCESS;
-  } else {
-    return EXIT_FAILURE;
-  }
+    CUfunction kernel_addr;
+    if (block_size == 16) {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
+    }
+    else {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
+    }

+    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};

+    // Execute the kernel
+    int nIter = 300;

+    for (int j = 0; j < nIter; j++) {
+        checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z, /* grid dim */
+                                       threads.x,
+                                       threads.y,
+                                       threads.z, /* block dim */
+                                       0,
+                                       0,       /* shared mem, stream */
+                                       &arr[0], /* arguments */
+                                       0));

+        checkCudaErrors(cuCtxSynchronize());
+    }

+    // Copy result from device to host
+    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));

+    printf("Checking computed result for correctness: ");

+    bool correct = true;

+    // test relative error by the formula
+    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps

+    double eps = 1.e-6; // machine zero

+    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
+        double abs_err    = fabs(h_C[i] - (dimsA.x * valB));
+        double dot_length = dimsA.x;
+        double abs_val    = fabs(h_C[i]);
+        double rel_err    = abs_err / abs_val / dot_length;

+        if (rel_err > eps) {
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
+            correct = false;
+        }
+    }

+    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n");

+    // Clean up memory
+    free(h_A);
+    free(h_B);
+    free(h_C);

+    checkCudaErrors(cuMemFree(d_A));
+    checkCudaErrors(cuMemFree(d_B));
+    checkCudaErrors(cuMemFree(d_C));

+    if (correct) {
+        return EXIT_SUCCESS;
+    }
+    else {
+        return EXIT_FAILURE;
+    }
 }
 /**
  * Program main
  */

-int main(int argc, char **argv) {
-  printf("[Matrix Multiply Using CUDA] - Starting...\n");
+int main(int argc, char **argv)
+{
+    printf("[Matrix Multiply Using CUDA] - Starting...\n");

-  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-      checkCmdLineFlag(argc, (const char **)argv, "?")) {
-    printf("Usage -device=n (n >= 0 for deviceID)\n");
-    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
-    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-    printf(
-        "  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
+        printf("Usage -device=n (n >= 0 for deviceID)\n");
+        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
+        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
+        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");

-    exit(EXIT_SUCCESS);
-  }
+        exit(EXIT_SUCCESS);
+    }

-  int block_size = 32;
+    int block_size = 32;

-  // original:
-  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
-  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
+    // original:
+    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
+    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

-  // reduce sizes to avoid running out of memory
-  // dim3 dimsA(32,32, 1);
-  // dim3 dimsB(32,32,1);
+    // reduce sizes to avoid running out of memory
+    // dim3 dimsA(32,32, 1);
+    // dim3 dimsB(32,32,1);

-  // width of Matrix A
-  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
-    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
-  }
+    // width of Matrix A
+    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
+        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
+    }

-  // height of Matrix A
-  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
-    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
-  }
+    // height of Matrix A
+    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
+        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
+    }

-  // width of Matrix B
-  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
-    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
-  }
+    // width of Matrix B
+    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
+        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
+    }

-  // height of Matrix B
-  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
-    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
-  }
+    // height of Matrix B
+    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
+        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
+    }

-  if (dimsA.x != dimsB.y) {
-    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-           dimsA.x, dimsB.y);
-    exit(EXIT_FAILURE);
-  }
+    if (dimsA.x != dimsB.y) {
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
+        exit(EXIT_FAILURE);
+    }

-  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
-         dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

-  int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
+    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

-  exit(matrix_result);
+    exit(matrix_result);
 }
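compileFileToCUBIN and loadCUBIN used in this sample come from its nvrtc_helper utilities; underneath, they follow the standard NVRTC sequence of creating a program from source, compiling it, extracting the generated image, and handing it to the driver API. A compressed sketch of that sequence, with error handling elided; the --gpu-architecture value is an assumption, and the helper actually extracts a CUBIN rather than PTX on recent toolkits:

    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "matrixMul_kernel.cu", 0, NULL, NULL); // src: kernel source text
    const char *opts[] = {"--gpu-architecture=compute_70"};               // assumed SM 7.0 target
    nvrtcCompileProgram(prog, 1, opts);
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    char *ptx = (char *)malloc(ptxSize);
    nvrtcGetPTX(prog, ptx);
    CUmodule module;
    cuModuleLoadData(&module, ptx); // the driver JIT-compiles the PTX for the current device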
@@ -48,84 +48,83 @@

 #include <cooperative_groups.h>

-template <int BLOCK_SIZE>
-__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
-  // Handle to thread block group
-  cooperative_groups::thread_block cta =
-      cooperative_groups::this_thread_block();
-  // Block index
-  int bx = blockIdx.x;
-  int by = blockIdx.y;
+template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
+    // Handle to thread block group
+    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
+    // Block index
+    int bx = blockIdx.x;
+    int by = blockIdx.y;

-  // Thread index
-  int tx = threadIdx.x;
-  int ty = threadIdx.y;
+    // Thread index
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;

-  // Index of the first sub-matrix of A processed by the block
-  int aBegin = wA * BLOCK_SIZE * by;
+    // Index of the first sub-matrix of A processed by the block
+    int aBegin = wA * BLOCK_SIZE * by;

-  // Index of the last sub-matrix of A processed by the block
-  int aEnd = aBegin + wA - 1;
+    // Index of the last sub-matrix of A processed by the block
+    int aEnd = aBegin + wA - 1;

-  // Step size used to iterate through the sub-matrices of A
-  int aStep = BLOCK_SIZE;
+    // Step size used to iterate through the sub-matrices of A
+    int aStep = BLOCK_SIZE;

-  // Index of the first sub-matrix of B processed by the block
-  int bBegin = BLOCK_SIZE * bx;
+    // Index of the first sub-matrix of B processed by the block
+    int bBegin = BLOCK_SIZE * bx;

-  // Step size used to iterate through the sub-matrices of B
-  int bStep = BLOCK_SIZE * wB;
+    // Step size used to iterate through the sub-matrices of B
+    int bStep = BLOCK_SIZE * wB;

-  // Csub is used to store the element of the block sub-matrix
-  // that is computed by the thread
-  float Csub = 0;
+    // Csub is used to store the element of the block sub-matrix
+    // that is computed by the thread
+    float Csub = 0;

-  // Loop over all the sub-matrices of A and B
-  // required to compute the block sub-matrix
-  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
-    // Declaration of the shared memory array As used to
-    // store the sub-matrix of A
-    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+    // Loop over all the sub-matrices of A and B
+    // required to compute the block sub-matrix
+    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
+        // Declaration of the shared memory array As used to
+        // store the sub-matrix of A
+        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

-    // Declaration of the shared memory array Bs used to
-    // store the sub-matrix of B
-    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+        // Declaration of the shared memory array Bs used to
+        // store the sub-matrix of B
+        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

-    // Load the matrices from device memory
-    // to shared memory; each thread loads
-    // one element of each matrix
-    As[ty][tx] = A[a + wA * ty + tx];
-    Bs[ty][tx] = B[b + wB * ty + tx];
+        // Load the matrices from device memory
+        // to shared memory; each thread loads
+        // one element of each matrix
+        As[ty][tx] = A[a + wA * ty + tx];
+        Bs[ty][tx] = B[b + wB * ty + tx];

-    // Synchronize to make sure the matrices are loaded
-    cooperative_groups::sync(cta);
+        // Synchronize to make sure the matrices are loaded
+        cooperative_groups::sync(cta);

 // Multiply the two matrices together;
 // each thread computes one element
 // of the block sub-matrix
 #pragma unroll
-    for (int k = 0; k < BLOCK_SIZE; ++k) {
-      Csub += As[ty][k] * Bs[k][tx];
+        for (int k = 0; k < BLOCK_SIZE; ++k) {
+            Csub += As[ty][k] * Bs[k][tx];
+        }

+        // Synchronize to make sure that the preceding
+        // computation is done before loading two new
+        // sub-matrices of A and B in the next iteration
+        cooperative_groups::sync(cta);
+    }

-    // Synchronize to make sure that the preceding
-    // computation is done before loading two new
-    // sub-matrices of A and B in the next iteration
-    cooperative_groups::sync(cta);
-  }

-  // Write the block sub-matrix to device memory;
-  // each thread writes one element
-  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
-  C[c + wB * ty + tx] = Csub;
+    // Write the block sub-matrix to device memory;
+    // each thread writes one element
+    int c               = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    C[c + wB * ty + tx] = Csub;
 }

-extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
-                                                 int wA, int wB) {
-  matrixMulCUDA<16>(C, A, B, wA, wB);
+extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
+{
+    matrixMulCUDA<16>(C, A, B, wA, wB);
 }

-extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
-                                                 int wA, int wB) {
-  matrixMulCUDA<32>(C, A, B, wA, wB);
+extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
+{
+    matrixMulCUDA<32>(C, A, B, wA, wB);
 }
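The two extern "C" __global__ wrappers exist because the driver API resolves kernels by their linker-visible names, and mangled C++ template instantiations are awkward to pass to cuModuleGetFunction; the wrappers pin an unmangled name to each block size. Each instantiation also statically reserves two BLOCK_SIZE x BLOCK_SIZE float tiles of shared memory per block; a quick check of that budget (helper name is illustrative):

    // Static shared memory per block for matrixMulCUDA<BLOCK_SIZE>:
    //   2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(float)
    //   BLOCK_SIZE = 16 -> 2 * 16 * 16 * 4 = 2048 bytes
    //   BLOCK_SIZE = 32 -> 2 * 32 * 32 * 4 = 8192 bytes
    static size_t tileSharedBytes(unsigned blockSize) { return 2u * blockSize * blockSize * sizeof(float); }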
@@ -28,252 +28,254 @@
 #include <cooperative_groups.h>

 namespace cg = cooperative_groups;
-#include <helper_cuda.h>
 #include <assert.h>
+#include <helper_cuda.h>

 #include "mergeSort_common.h"

-inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
-                                  uint &valB, uint arrowDir) {
-  uint t;
+inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
+{
+    uint t;

-  if ((keyA > keyB) == arrowDir) {
-    t = keyA;
-    keyA = keyB;
-    keyB = t;
-    t = valA;
-    valA = valB;
-    valB = t;
-  }
+    if ((keyA > keyB) == arrowDir) {
+        t    = keyA;
+        keyA = keyB;
+        keyB = t;
+        t    = valA;
+        valA = valB;
+        valB = t;
+    }
 }
-__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
-                                        uint *d_SrcKey, uint *d_SrcVal,
-                                        uint arrayLength, uint sortDir) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  // Shared memory storage for one or more short vectors
-  __shared__ uint s_key[SHARED_SIZE_LIMIT];
-  __shared__ uint s_val[SHARED_SIZE_LIMIT];
+__global__ void
+bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    // Shared memory storage for one or more short vectors
+    __shared__ uint s_key[SHARED_SIZE_LIMIT];
+    __shared__ uint s_val[SHARED_SIZE_LIMIT];

-  // Offset to the beginning of subbatch and load data
-  d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
-  s_key[threadIdx.x + 0] = d_SrcKey[0];
-  s_val[threadIdx.x + 0] = d_SrcVal[0];
-  s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-      d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-  s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-      d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+    // Offset to the beginning of subbatch and load data
+    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
+    s_key[threadIdx.x + 0]                       = d_SrcKey[0];
+    s_val[threadIdx.x + 0]                       = d_SrcVal[0];
+    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

-  for (uint size = 2; size < arrayLength; size <<= 1) {
-    // Bitonic merge
-    uint dir = (threadIdx.x & (size / 2)) != 0;
+    for (uint size = 2; size < arrayLength; size <<= 1) {
+        // Bitonic merge
+        uint dir = (threadIdx.x & (size / 2)) != 0;

-    for (uint stride = size / 2; stride > 0; stride >>= 1) {
-      cg::sync(cta);
-      uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                 s_val[pos + stride], dir);
+        for (uint stride = size / 2; stride > 0; stride >>= 1) {
+            cg::sync(cta);
+            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
+        }
+    }
-  }

-  // ddd == sortDir for the last bitonic merge step
-  {
-    for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
-      cg::sync(cta);
-      uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                 s_val[pos + stride], sortDir);
+    // ddd == sortDir for the last bitonic merge step
+    {
+        for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
+            cg::sync(cta);
+            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
+        }
+    }
-  }

-  cg::sync(cta);
-  d_DstKey[0] = s_key[threadIdx.x + 0];
-  d_DstVal[0] = s_val[threadIdx.x + 0];
-  d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
-      s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-  d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
-      s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    cg::sync(cta);
+    d_DstKey[0]                       = s_key[threadIdx.x + 0];
+    d_DstVal[0]                       = s_val[threadIdx.x + 0];
+    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
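The index expression used in both merge loops, pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)), assigns each of the SHARED_SIZE_LIMIT / 2 threads the lower element of a disjoint compare-exchange pair (pos, pos + stride). A small host-side sanity check of the mapping, not from the sample:

    // For stride = 4 and tid = 0..7, pos yields 0,1,2,3,8,9,10,11:
    // every thread owns the pair (pos, pos + 4) and no pairs collide.
    for (unsigned tid = 0; tid < 8; tid++) {
        unsigned stride = 4;
        unsigned pos    = 2 * tid - (tid & (stride - 1));
        printf("tid=%u -> (%u, %u)\n", tid, pos, pos + stride);
    }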
 // Helper function (also used by odd-even merge sort)
-extern "C" uint factorRadix2(uint *log2L, uint L) {
-  if (!L) {
-    *log2L = 0;
-    return 0;
-  } else {
-    for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
-      ;
+extern "C" uint factorRadix2(uint *log2L, uint L)
+{
+    if (!L) {
+        *log2L = 0;
+        return 0;
+    }
+    else {
+        for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
+            ;

-    return L;
-  }
+        return L;
+    }
 }

-extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
-                                  uint *d_SrcKey, uint *d_SrcVal,
-                                  uint batchSize, uint arrayLength,
-                                  uint sortDir) {
-  // Nothing to sort
-  if (arrayLength < 2) {
-    return;
-  }
+extern "C" void bitonicSortShared(uint *d_DstKey,
+                                  uint *d_DstVal,
+                                  uint *d_SrcKey,
+                                  uint *d_SrcVal,
+                                  uint  batchSize,
+                                  uint  arrayLength,
+                                  uint  sortDir)
+{
+    // Nothing to sort
+    if (arrayLength < 2) {
+        return;
+    }

-  // Only power-of-two array lengths are supported by this implementation
-  uint log2L;
-  uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
-  assert(factorizationRemainder == 1);
+    // Only power-of-two array lengths are supported by this implementation
+    uint log2L;
+    uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
+    assert(factorizationRemainder == 1);

-  uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
-  uint threadCount = SHARED_SIZE_LIMIT / 2;
+    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
+    uint threadCount = SHARED_SIZE_LIMIT / 2;

-  assert(arrayLength <= SHARED_SIZE_LIMIT);
-  assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
+    assert(arrayLength <= SHARED_SIZE_LIMIT);
+    assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);

-  bitonicSortSharedKernel<<<blockCount, threadCount>>>(
-      d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
-  getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
+    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
+    getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) {
-  return ((a % b) == 0) ? (a / b) : (a / b + 1);
-}
+static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

-static inline __host__ __device__ uint getSampleCount(uint dividend) {
-  return iDivUp(dividend, SAMPLE_STRIDE);
-}
+static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
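iDivUp is ceiling division: iDivUp(10, 4) is 3 where plain integer division gives 2, so partially filled intervals still receive a block or a sample. The branchy form also sidesteps the wraparound that the common (a + b - 1) / b one-liner risks when a is near UINT_MAX. Typical use, mirroring getSampleCount above:

    // One sample per SAMPLE_STRIDE-element interval, rounded up.
    uint sampleCount = iDivUp(elementCount, SAMPLE_STRIDE);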
+template <uint sortDir>
+static inline __device__ void
+ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
+{
+    uint t;

+    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
+        || ((arrowDir != sortDir) && (flagB == 1))) {
+        t     = keyA;
+        keyA  = keyB;
+        keyB  = t;
+        t     = valA;
+        valA  = valB;
+        valB  = t;
+        t     = flagA;
+        flagA = flagB;
+        flagB = t;
+    }
+}

-template <uint sortDir>
-static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
-                                                 uint &flagA, uint &keyB,
-                                                 uint &valB, uint &flagB,
-                                                 uint arrowDir) {
-  uint t;
+template <uint sortDir>
+__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
+                                                      uint *d_DstVal,
+                                                      uint *d_SrcKey,
+                                                      uint *d_SrcVal,
+                                                      uint *d_LimitsA,
+                                                      uint *d_LimitsB,
+                                                      uint  stride,
+                                                      uint  N)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    __shared__ uint  s_key[2 * SAMPLE_STRIDE];
+    __shared__ uint  s_val[2 * SAMPLE_STRIDE];
+    __shared__ uint  s_inf[2 * SAMPLE_STRIDE];

-  if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
-      ((arrowDir == sortDir) && (flagA == 1)) ||
-      ((arrowDir != sortDir) && (flagB == 1))) {
-    t = keyA;
-    keyA = keyB;
-    keyB = t;
-    t = valA;
-    valA = valB;
-    valB = t;
-    t = flagA;
-    flagA = flagB;
-    flagB = t;
-  }
-}
+    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
+    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
+    d_SrcKey += segmentBase;
+    d_SrcVal += segmentBase;
+    d_DstKey += segmentBase;
+    d_DstVal += segmentBase;

-template <uint sortDir>
-__global__ void bitonicMergeElementaryIntervalsKernel(
-    uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
-    uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
-  // Handle to thread block group
-  cg::thread_block cta = cg::this_thread_block();
-  __shared__ uint s_key[2 * SAMPLE_STRIDE];
-  __shared__ uint s_val[2 * SAMPLE_STRIDE];
-  __shared__ uint s_inf[2 * SAMPLE_STRIDE];
+    // Set up threadblock-wide parameters
+    __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;

-  const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
-  const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
-  d_SrcKey += segmentBase;
-  d_SrcVal += segmentBase;
-  d_DstKey += segmentBase;
-  d_DstVal += segmentBase;
+    if (threadIdx.x == 0) {
+        uint segmentElementsA = stride;
+        uint segmentElementsB = umin(stride, N - segmentBase - stride);
+        uint segmentSamplesA  = stride / SAMPLE_STRIDE;
+        uint segmentSamplesB  = getSampleCount(segmentElementsB);
+        uint segmentSamples   = segmentSamplesA + segmentSamplesB;

-  // Set up threadblock-wide parameters
-  __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
+        startSrcA = d_LimitsA[blockIdx.x];
+        startSrcB = d_LimitsB[blockIdx.x];
+        startDst  = startSrcA + startSrcB;

-  if (threadIdx.x == 0) {
-    uint segmentElementsA = stride;
-    uint segmentElementsB = umin(stride, N - segmentBase - stride);
-    uint segmentSamplesA = stride / SAMPLE_STRIDE;
-    uint segmentSamplesB = getSampleCount(segmentElementsB);
-    uint segmentSamples = segmentSamplesA + segmentSamplesB;
+        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
+        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
+        lenSrcA      = endSrcA - startSrcA;
+        lenSrcB      = endSrcB - startSrcB;
+    }

-    startSrcA = d_LimitsA[blockIdx.x];
-    startSrcB = d_LimitsB[blockIdx.x];
-    startDst = startSrcA + startSrcB;
+    s_inf[threadIdx.x + 0]             = 1;
+    s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

-    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
-                                                    : segmentElementsA;
-    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
-                                                    : segmentElementsB;
-    lenSrcA = endSrcA - startSrcA;
-    lenSrcB = endSrcB - startSrcB;
-  }

-  s_inf[threadIdx.x + 0] = 1;
-  s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

-  // Load input data
-  cg::sync(cta);

-  if (threadIdx.x < lenSrcA) {
-    s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
-    s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
-    s_inf[threadIdx.x] = 0;
-  }

-  // Prepare for bitonic merge by inversing the ordering
-  if (threadIdx.x < lenSrcB) {
-    s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-        d_SrcKey[stride + startSrcB + threadIdx.x];
-    s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-        d_SrcVal[stride + startSrcB + threadIdx.x];
-    s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
-  }

-  //"Extended" bitonic merge
-  for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
+    // Load input data
+    cg::sync(cta);
-    uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-    ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
-                                s_key[pos + stride], s_val[pos + stride],
-                                s_inf[pos + stride], sortDir);
-  }

-  // Store sorted data
-  cg::sync(cta);
-  d_DstKey += startDst;
-  d_DstVal += startDst;
+    if (threadIdx.x < lenSrcA) {
+        s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
+        s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
+        s_inf[threadIdx.x] = 0;
+    }

-  if (threadIdx.x < lenSrcA) {
-    d_DstKey[threadIdx.x] = s_key[threadIdx.x];
-    d_DstVal[threadIdx.x] = s_val[threadIdx.x];
-  }
+    // Prepare for bitonic merge by inversing the ordering
+    if (threadIdx.x < lenSrcB) {
+        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
+        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
+        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
+    }

-  if (threadIdx.x < lenSrcB) {
-    d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
-    d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
-  }
+    //"Extended" bitonic merge
+    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
+        cg::sync(cta);
+        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+        ComparatorExtended<sortDir>(s_key[pos + 0],
+                                    s_val[pos + 0],
+                                    s_inf[pos + 0],
+                                    s_key[pos + stride],
+                                    s_val[pos + stride],
+                                    s_inf[pos + stride],
+                                    sortDir);
+    }

+    // Store sorted data
+    cg::sync(cta);
+    d_DstKey += startDst;
+    d_DstVal += startDst;

+    if (threadIdx.x < lenSrcA) {
+        d_DstKey[threadIdx.x] = s_key[threadIdx.x];
+        d_DstVal[threadIdx.x] = s_val[threadIdx.x];
+    }

+    if (threadIdx.x < lenSrcB) {
+        d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
+        d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
+    }
 }

-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
-                                                uint *d_SrcKey, uint *d_SrcVal,
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
+                                                uint *d_DstVal,
+                                                uint *d_SrcKey,
+                                                uint *d_SrcVal,
                                                 uint *d_LimitsA,
-                                                uint *d_LimitsB, uint stride,
-                                                uint N, uint sortDir) {
-  uint lastSegmentElements = N % (2 * stride);
+                                                uint *d_LimitsB,
+                                                uint  stride,
+                                                uint  N,
+                                                uint  sortDir)
+{
+    uint lastSegmentElements = N % (2 * stride);

-  uint mergePairs = (lastSegmentElements > stride)
			
		||||
                        ? getSampleCount(N)
 | 
			
		||||
                        : (N - lastSegmentElements) / SAMPLE_STRIDE;
 | 
			
		||||
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
 | 
			
		||||
 | 
			
		||||
  if (sortDir) {
 | 
			
		||||
    bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
 | 
			
		||||
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
 | 
			
		||||
        N);
 | 
			
		||||
    getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
 | 
			
		||||
  } else {
 | 
			
		||||
    bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
 | 
			
		||||
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
 | 
			
		||||
        N);
 | 
			
		||||
    getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
 | 
			
		||||
  }
 | 
			
		||||
    if (sortDir) {
 | 
			
		||||
        bitonicMergeElementaryIntervalsKernel<1U>
 | 
			
		||||
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
 | 
			
		||||
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        bitonicMergeElementaryIntervalsKernel<0U>
 | 
			
		||||
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
 | 
			
		||||
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
 | 
			
		||||
    }
 | 
			
		||||
}
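
// Launch-geometry note (illustrative, not from the original source): the
// wrapper above starts mergePairs blocks of SAMPLE_STRIDE threads, one block
// per elementary interval, so each block bitonically merges at most
// 2 * SAMPLE_STRIDE elements bounded by d_LimitsA/d_LimitsB.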

@@ -26,96 +26,94 @@
 */

#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
  uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
  StopWatchInterface *hTimer = NULL;
int main(int argc, char **argv)
{
    uint               *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint               *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

  const uint N = 4 * 1048576;
  const uint DIR = 1;
  const uint numValues = 65536;
    const uint N         = 4 * 1048576;
    const uint DIR       = 1;
    const uint numValues = 65536;

  printf("%s Starting...\n\n", argv[0]);
    printf("%s Starting...\n\n", argv[0]);

  int dev = findCudaDevice(argc, (const char **)argv);
    int dev = findCudaDevice(argc, (const char **)argv);

  if (dev == -1) {
    return EXIT_FAILURE;
  }
    if (dev == -1) {
        return EXIT_FAILURE;
    }

  printf("Allocating and initializing host arrays...\n\n");
  sdkCreateTimer(&hTimer);
  h_SrcKey = (uint *)malloc(N * sizeof(uint));
  h_SrcVal = (uint *)malloc(N * sizeof(uint));
  h_DstKey = (uint *)malloc(N * sizeof(uint));
  h_DstVal = (uint *)malloc(N * sizeof(uint));
    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

  srand(2009);
    srand(2009);

  for (uint i = 0; i < N; i++) {
    h_SrcKey[i] = rand() % numValues;
  }
    for (uint i = 0; i < N; i++) {
        h_SrcKey[i] = rand() % numValues;
    }

  fillValues(h_SrcVal, N);
    fillValues(h_SrcVal, N);

  printf("Allocating and initializing CUDA arrays...\n\n");
  checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
  checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
  checkCudaErrors(
      cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
  checkCudaErrors(
      cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

  printf("Initializing GPU merge sort...\n");
  initMergeSort();
    printf("Initializing GPU merge sort...\n");
    initMergeSort();

  printf("Running GPU merge sort...\n");
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);
  mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);
  printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

  printf("Reading back GPU merge sort results...\n");
  checkCudaErrors(
      cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
  checkCudaErrors(
      cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

  printf("Inspecting the results...\n");
  uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);

  uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
    uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);

  printf("Shutting down...\n");
  closeMergeSort();
  sdkDeleteTimer(&hTimer);
  checkCudaErrors(cudaFree(d_SrcVal));
  checkCudaErrors(cudaFree(d_SrcKey));
  checkCudaErrors(cudaFree(d_BufVal));
  checkCudaErrors(cudaFree(d_BufKey));
  checkCudaErrors(cudaFree(d_DstVal));
  checkCudaErrors(cudaFree(d_DstKey));
  free(h_DstVal);
  free(h_DstKey);
  free(h_SrcVal);
  free(h_SrcKey);
    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

  exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
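
/*
 * Reference-check sketch (illustrative, not part of the sample): with DIR == 1
 * the device output must match the host keys sorted ascending. The helper name
 * referenceCheck is hypothetical.
 */
#include <algorithm>
#include <vector>

static bool referenceCheck(const unsigned int *h_SrcKey, const unsigned int *h_DstKey, unsigned int n)
{
    std::vector<unsigned int> ref(h_SrcKey, h_SrcKey + n); // copy the unsorted keys
    std::sort(ref.begin(), ref.end());                     // ascending, matching DIR == 1
    return std::equal(ref.begin(), ref.end(), h_DstKey);   // compare with GPU result
}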

@@ -39,491 +39,499 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) {
  return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) {
  return iDivUp(dividend, SAMPLE_STRIDE);
}
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
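
// Worked values (illustrative): iDivUp(256, 256) == 1 and iDivUp(257, 256) == 2,
// so getSampleCount rounds a partially filled last SAMPLE_STRIDE-sized slot up
// to a whole sample, e.g. getSampleCount(129) == 2 when SAMPLE_STRIDE == 128
// (the value assumed here from mergeSort_common.h).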

#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x) {
  /*
      --x;
      x |= x >> 1;
      x |= x >> 2;
      x |= x >> 4;
      x |= x >> 8;
      x |= x >> 16;
      return ++x;
  */
  return 1U << (W - __clz(x - 1));
static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return ++x;
    */
    return 1U << (W - __clz(x - 1));
}
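
// Worked example (illustrative): for x == 5, x - 1 == 4 has 29 leading zeros in
// a 32-bit word, so W - __clz(x - 1) == 3 and the result is 1U << 3 == 8, the
// smallest power of two >= 5; for x == 1, __clz(0) == 32 yields 1U << 0 == 1.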

template <uint sortDir>
static inline __device__ uint binarySearchInclusive(uint val, uint *data,
                                                    uint L, uint stride) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (; stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] <= val)) ||
        (!sortDir && (data[newPos - 1] >= val))) {
      pos = newPos;
template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

template <uint sortDir>
static inline __device__ uint binarySearchExclusive(uint val, uint *data,
                                                    uint L, uint stride) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (; stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] < val)) ||
        (!sortDir && (data[newPos - 1] > val))) {
      pos = newPos;
template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
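
// Behaviour sketch (illustrative): with ascending data {1, 3, 5, 7}, L == 4 and
// an initial stride of 4,
//   binarySearchExclusive<1U>(5, data, 4, 4) == 2   // elements <  5
//   binarySearchInclusive<1U>(5, data, 4, 4) == 3   // elements <= 5
// The exclusive/inclusive pair is what keeps equal keys stable when the two
// halves are merged below.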

////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
                                      uint *d_SrcKey, uint *d_SrcVal,
                                      uint arrayLength) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[SHARED_SIZE_LIMIT];
  __shared__ uint s_val[SHARED_SIZE_LIMIT];
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint  s_key[SHARED_SIZE_LIMIT];
    __shared__ uint  s_val[SHARED_SIZE_LIMIT];

  d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
  s_key[threadIdx.x + 0] = d_SrcKey[0];
  s_val[threadIdx.x + 0] = d_SrcVal[0];
  s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
      d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
  s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
      d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0]                       = d_SrcKey[0];
    s_val[threadIdx.x + 0]                       = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

  for (uint stride = 1; stride < arrayLength; stride <<= 1) {
    uint lPos = threadIdx.x & (stride - 1);
    uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
    uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint  lPos    = threadIdx.x & (stride - 1);
        uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
        uint *baseVal = s_val + 2 * (threadIdx.x - lPos);

        cg::sync(cta);
        uint keyA = baseKey[lPos + 0];
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
        baseVal[posA] = valA;
        baseKey[posB] = keyB;
        baseVal[posB] = valB;
    }

    cg::sync(cta);
    uint keyA = baseKey[lPos + 0];
    uint valA = baseVal[lPos + 0];
    uint keyB = baseKey[lPos + stride];
    uint valB = baseVal[lPos + stride];
    uint posA =
        binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) +
        lPos;
    uint posB =
        binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) +
        lPos;

    cg::sync(cta);
    baseKey[posA] = keyA;
    baseVal[posA] = valA;
    baseKey[posB] = keyB;
    baseVal[posB] = valB;
  }

  cg::sync(cta);
  d_DstKey[0] = s_key[threadIdx.x + 0];
  d_DstVal[0] = s_val[threadIdx.x + 0];
  d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
      s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
  d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
      s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstKey[0]                       = s_key[threadIdx.x + 0];
    d_DstVal[0]                       = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
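
// Merge identity used above (illustrative): an element of half A at local index
// lPos lands at lPos + (number of B elements that must precede it), which is
// binarySearchExclusive over B; elements of B use the inclusive variant, so
// equal keys from A are written first and the merge stays stable.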

static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
                            uint *d_SrcVal, uint batchSize, uint arrayLength,
                            uint sortDir) {
  if (arrayLength < 2) {
    return;
  }
static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint  batchSize,
                            uint  arrayLength,
                            uint  sortDir)
{
    if (arrayLength < 2) {
        return;
    }

  assert(SHARED_SIZE_LIMIT % arrayLength == 0);
  assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
  uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
  uint threadCount = SHARED_SIZE_LIMIT / 2;
    assert(SHARED_SIZE_LIMIT % arrayLength == 0);
    assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

  if (sortDir) {
    mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
    getLastCudaError("mergeSortShared<1><<<>>> failed\n");
  } else {
    mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
    getLastCudaError("mergeSortShared<0><<<>>> failed\n");
  }
    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
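
// Launch math (illustrative): every thread owns two of the SHARED_SIZE_LIMIT
// elements held by a block, hence blockCount = batchSize * arrayLength /
// SHARED_SIZE_LIMIT and threadCount = SHARED_SIZE_LIMIT / 2 above.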

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
                                          uint *d_SrcKey, uint stride, uint N,
                                          uint threadCount) {
  uint pos = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

  if (pos >= threadCount) {
    return;
  }
    if (pos >= threadCount) {
        return;
    }

  const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
  const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
  d_SrcKey += segmentBase;
  d_RanksA += segmentBase / SAMPLE_STRIDE;
  d_RanksB += segmentBase / SAMPLE_STRIDE;
    const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_SrcKey += segmentBase;
    d_RanksA += segmentBase / SAMPLE_STRIDE;
    d_RanksB += segmentBase / SAMPLE_STRIDE;

  const uint segmentElementsA = stride;
  const uint segmentElementsB = umin(stride, N - segmentBase - stride);
  const uint segmentSamplesA = getSampleCount(segmentElementsA);
  const uint segmentSamplesB = getSampleCount(segmentElementsB);
    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA  = getSampleCount(segmentElementsA);
    const uint segmentSamplesB  = getSampleCount(segmentElementsB);

  if (i < segmentSamplesA) {
    d_RanksA[i] = i * SAMPLE_STRIDE;
    d_RanksB[i] = binarySearchExclusive<sortDir>(
        d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB,
        nextPowerOfTwo(segmentElementsB));
  }
    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

  if (i < segmentSamplesB) {
    d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
    d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
        d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA,
        nextPowerOfTwo(segmentElementsA));
  }
    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
                                uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint threadCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  if (sortDir) {
    generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>(
        d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
    getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
  } else {
    generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>(
        d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
    getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
  }
    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
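
// Sizing sketch (illustrative): each pair of stride-sized segments needs
// stride / SAMPLE_STRIDE sampling threads, e.g. 8 threads per pair for
// stride == 1024 with SAMPLE_STRIDE == 128; the wrapper above rounds the total
// into 256-thread blocks with iDivUp(threadCount, 256).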

////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,
                                           uint stride, uint N,
                                           uint threadCount) {
  uint pos = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

  if (pos >= threadCount) {
    return;
  }
    if (pos >= threadCount) {
        return;
    }

  const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
  const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
  d_Ranks += (pos - i) * 2;
  d_Limits += (pos - i) * 2;
    const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_Ranks += (pos - i) * 2;
    d_Limits += (pos - i) * 2;

  const uint segmentElementsA = stride;
  const uint segmentElementsB = umin(stride, N - segmentBase - stride);
  const uint segmentSamplesA = getSampleCount(segmentElementsA);
  const uint segmentSamplesB = getSampleCount(segmentElementsB);
    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA  = getSampleCount(segmentElementsA);
    const uint segmentSamplesB  = getSampleCount(segmentElementsB);

  if (i < segmentSamplesA) {
    uint dstPos = binarySearchExclusive<1U>(
                      d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB,
                      nextPowerOfTwo(segmentSamplesB)) +
                  i;
    d_Limits[dstPos] = d_Ranks[i];
  }
    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                    + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

  if (i < segmentSamplesB) {
    uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i],
                                            d_Ranks, segmentSamplesA,
                                            nextPowerOfTwo(segmentSamplesA)) +
                  i;
    d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
  }
    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                    + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
                                 uint *d_RanksA, uint *d_RanksB, uint stride,
                                 uint N) {
  uint lastSegmentElements = N % (2 * stride);
  uint threadCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
      d_LimitsA, d_RanksA, stride, N, threadCount);
  getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

  mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
      d_LimitsB, d_RanksB, stride, N, threadCount);
  getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
                             uint *srcAVal, uint *srcBKey, uint *srcBVal,
                             uint lenA, uint nPowTwoLenA, uint lenB,
                             uint nPowTwoLenB, cg::thread_block cta) {
  uint keyA, valA, keyB, valB, dstPosA, dstPosB;
inline __device__ void merge(uint            *dstKey,
                             uint            *dstVal,
                             uint            *srcAKey,
                             uint            *srcAVal,
                             uint            *srcBKey,
                             uint            *srcBVal,
                             uint             lenA,
                             uint             nPowTwoLenA,
                             uint             lenB,
                             uint             nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

  if (threadIdx.x < lenA) {
    keyA = srcAKey[threadIdx.x];
    valA = srcAVal[threadIdx.x];
    dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) +
              threadIdx.x;
  }
    if (threadIdx.x < lenA) {
        keyA    = srcAKey[threadIdx.x];
        valA    = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

  if (threadIdx.x < lenB) {
    keyB = srcBKey[threadIdx.x];
    valB = srcBVal[threadIdx.x];
    dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) +
              threadIdx.x;
  }
    if (threadIdx.x < lenB) {
        keyB    = srcBKey[threadIdx.x];
        valB    = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

  cg::sync(cta);
    cg::sync(cta);

  if (threadIdx.x < lenA) {
    dstKey[dstPosA] = keyA;
    dstVal[dstPosA] = valA;
  }
    if (threadIdx.x < lenA) {
        dstKey[dstPosA] = keyA;
        dstVal[dstPosA] = valA;
    }

  if (threadIdx.x < lenB) {
    dstKey[dstPosB] = keyB;
    dstVal[dstPosB] = valB;
  }
    if (threadIdx.x < lenB) {
        dstKey[dstPosB] = keyB;
        dstVal[dstPosB] = valB;
    }
}
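
// Worked example (illustrative): merging A = {1, 3} with B = {2, 4} and
// sortDir == 1, thread 0 scatters A[0] = 1 to 0 + 0 and B[0] = 2 to 1 + 0,
// thread 1 scatters A[1] = 3 to 1 + 1 and B[1] = 4 to 2 + 1, producing
// {1, 2, 3, 4} with no collisions between the A-side and B-side positions.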

template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
                                               uint *d_SrcKey, uint *d_SrcVal,
                                               uint *d_LimitsA, uint *d_LimitsB,
                                               uint stride, uint N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ uint s_key[2 * SAMPLE_STRIDE];
  __shared__ uint s_val[2 * SAMPLE_STRIDE];
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                               uint *d_DstVal,
                                               uint *d_SrcKey,
                                               uint *d_SrcVal,
                                               uint *d_LimitsA,
                                               uint *d_LimitsB,
                                               uint  stride,
                                               uint  N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint  s_key[2 * SAMPLE_STRIDE];
    __shared__ uint  s_val[2 * SAMPLE_STRIDE];

  const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
  const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
  d_SrcKey += segmentBase;
  d_SrcVal += segmentBase;
  d_DstKey += segmentBase;
  d_DstVal += segmentBase;
    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

  // Set up threadblock-wide parameters
  __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;

  if (threadIdx.x == 0) {
    uint segmentElementsA = stride;
    uint segmentElementsB = umin(stride, N - segmentBase - stride);
    uint segmentSamplesA = getSampleCount(segmentElementsA);
    uint segmentSamplesB = getSampleCount(segmentElementsB);
    uint segmentSamples = segmentSamplesA + segmentSamplesB;
    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA  = getSampleCount(segmentElementsA);
        uint segmentSamplesB  = getSampleCount(segmentElementsB);
        uint segmentSamples   = segmentSamplesA + segmentSamplesB;

    startSrcA = d_LimitsA[blockIdx.x];
    startSrcB = d_LimitsB[blockIdx.x];
    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
                                                    : segmentElementsA;
    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
                                                    : segmentElementsB;
    lenSrcA = endSrcA - startSrcA;
    lenSrcB = endSrcB - startSrcB;
    startDstA = startSrcA + startSrcB;
    startDstB = startDstA + lenSrcA;
  }

  // Load main input data
  cg::sync(cta);

  if (threadIdx.x < lenSrcA) {
    s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
    s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
  }

  if (threadIdx.x < lenSrcB) {
    s_key[threadIdx.x + SAMPLE_STRIDE] =
        d_SrcKey[stride + startSrcB + threadIdx.x];
    s_val[threadIdx.x + SAMPLE_STRIDE] =
        d_SrcVal[stride + startSrcB + threadIdx.x];
  }

  // Merge data in shared memory
  cg::sync(cta);
  merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE,
                 s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB,
                 SAMPLE_STRIDE, cta);

  // Store merged data
  cg::sync(cta);

  if (threadIdx.x < lenSrcA) {
    d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
    d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
  }

  if (threadIdx.x < lenSrcB) {
    d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
    d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
  }
}

static void mergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
                                     uint *d_SrcKey, uint *d_SrcVal,
                                     uint *d_LimitsA, uint *d_LimitsB,
                                     uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint mergePairs = (lastSegmentElements > stride)
                        ? getSampleCount(N)
                        : (N - lastSegmentElements) / SAMPLE_STRIDE;

  if (sortDir) {
    mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
        N);
    getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
  } else {
    mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
        N);
    getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
  }
}

extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
                                  uint *d_SrcKey, uint *d_SrcVal,
                                  uint batchSize, uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
                                                uint *d_SrcKey, uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB, uint stride,
                                                uint N, uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void) {
  checkCudaErrors(
      cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
  checkCudaErrors(
      cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
  checkCudaErrors(
      cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
  checkCudaErrors(
      cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void) {
  checkCudaErrors(cudaFree(d_RanksA));
  checkCudaErrors(cudaFree(d_RanksB));
  checkCudaErrors(cudaFree(d_LimitsB));
  checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
                          uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal,
                          uint N, uint sortDir) {
  uint stageCount = 0;

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
    ;

  uint *ikey, *ival, *okey, *oval;

  if (stageCount & 1) {
    ikey = d_BufKey;
    ival = d_BufVal;
    okey = d_DstKey;
    oval = d_DstVal;
  } else {
    ikey = d_DstKey;
    ival = d_DstVal;
    okey = d_BufKey;
    oval = d_BufVal;
  }
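
  // Ping-pong note (illustrative): stageCount is the number of doubling merge
  // passes to come; its parity chooses the initial input/output buffers so the
  // final pass always writes into d_DstKey.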
 | 
			
		||||
 | 
			
		||||
  assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
 | 
			
		||||
  assert(N % SHARED_SIZE_LIMIT == 0);
 | 
			
		||||
  mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT,
 | 
			
		||||
                  SHARED_SIZE_LIMIT, sortDir);
 | 
			
		||||
 | 
			
		||||
  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
 | 
			
		||||
    uint lastSegmentElements = N % (2 * stride);
 | 
			
		||||
 | 
			
		||||
    // Find sample ranks and prepare for limiters merge
 | 
			
		||||
    generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
 | 
			
		||||
 | 
			
		||||
    // Merge ranks and indices
 | 
			
		||||
    mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
 | 
			
		||||
 | 
			
		||||
    // Merge elementary intervals
 | 
			
		||||
    mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB,
 | 
			
		||||
                             stride, N, sortDir);
 | 
			
		||||
 | 
			
		||||
    if (lastSegmentElements <= stride) {
 | 
			
		||||
      // Last merge segment consists of a single array which just needs to be
 | 
			
		||||
      // passed through
 | 
			
		||||
      checkCudaErrors(cudaMemcpy(
 | 
			
		||||
          okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
 | 
			
		||||
          lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
 | 
			
		||||
      checkCudaErrors(cudaMemcpy(
 | 
			
		||||
          oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
 | 
			
		||||
          lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
 | 
			
		||||
        startSrcA    = d_LimitsA[blockIdx.x];
 | 
			
		||||
        startSrcB    = d_LimitsB[blockIdx.x];
 | 
			
		||||
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
 | 
			
		||||
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
 | 
			
		||||
        lenSrcA      = endSrcA - startSrcA;
 | 
			
		||||
        lenSrcB      = endSrcB - startSrcB;
 | 
			
		||||
        startDstA    = startSrcA + startSrcB;
 | 
			
		||||
        startDstB    = startDstA + lenSrcA;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    uint *t;
 | 
			
		||||
    t = ikey;
 | 
			
		||||
    ikey = okey;
 | 
			
		||||
    okey = t;
 | 
			
		||||
    t = ival;
 | 
			
		||||
    ival = oval;
 | 
			
		||||
    oval = t;
 | 
			
		||||
  }
 | 
			
		||||
    // Load main input data
 | 
			
		||||
    cg::sync(cta);
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcA) {
 | 
			
		||||
        s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
 | 
			
		||||
        s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcB) {
 | 
			
		||||
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
 | 
			
		||||
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Merge data in shared memory
 | 
			
		||||
    cg::sync(cta);
 | 
			
		||||
    merge<sortDir>(s_key,
 | 
			
		||||
                   s_val,
 | 
			
		||||
                   s_key + 0,
 | 
			
		||||
                   s_val + 0,
 | 
			
		||||
                   s_key + SAMPLE_STRIDE,
 | 
			
		||||
                   s_val + SAMPLE_STRIDE,
 | 
			
		||||
                   lenSrcA,
 | 
			
		||||
                   SAMPLE_STRIDE,
 | 
			
		||||
                   lenSrcB,
 | 
			
		||||
                   SAMPLE_STRIDE,
 | 
			
		||||
                   cta);
 | 
			
		||||
 | 
			
		||||
    // Store merged data
 | 
			
		||||
    cg::sync(cta);
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcA) {
 | 
			
		||||
        d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
 | 
			
		||||
        d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (threadIdx.x < lenSrcB) {
 | 
			
		||||
        d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
 | 
			
		||||
        d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void mergeElementaryIntervals(uint *d_DstKey,
 | 
			
		||||
                                     uint *d_DstVal,
 | 
			
		||||
                                     uint *d_SrcKey,
 | 
			
		||||
                                     uint *d_SrcVal,
 | 
			
		||||
                                     uint *d_LimitsA,
 | 
			
		||||
                                     uint *d_LimitsB,
 | 
			
		||||
                                     uint  stride,
 | 
			
		||||
                                     uint  N,
 | 
			
		||||
                                     uint  sortDir)
 | 
			
		||||
{
 | 
			
		||||
    uint lastSegmentElements = N % (2 * stride);
 | 
			
		||||
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
 | 
			
		||||
 | 
			
		||||
    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint  batchSize,
                                  uint  arrayLength,
                                  uint  sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint  stride,
                                                uint  N,
                                                uint  sortDir);

static uint      *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey,
                          uint *d_DstVal,
                          uint *d_BufKey,
                          uint *d_BufVal,
                          uint *d_SrcKey,
                          uint *d_SrcVal,
                          uint  N,
                          uint  sortDir)
{
    uint stageCount = 0;

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
        ;

    uint *ikey, *ival, *okey, *oval;

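    // Choose the ping-pong buffers by the parity of stageCount so that the
    // final swap leaves the sorted result in d_DstKey / d_DstVal.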
    if (stageCount & 1) {
        ikey = d_BufKey;
        ival = d_BufVal;
        okey = d_DstKey;
        oval = d_DstVal;
    }
    else {
        ikey = d_DstKey;
        ival = d_DstVal;
        okey = d_BufKey;
        oval = d_BufVal;
    }

    assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
    assert(N % SHARED_SIZE_LIMIT == 0);
    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);

        // Find sample ranks and prepare for limiters merge
        generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);

        // Merge ranks and indices
        mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
                                       ikey + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
                                       ival + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
        }

        uint *t;
        t    = ikey;
        ikey = okey;
        okey = t;
        t    = ival;
        ival = oval;
        oval = t;
    }
}

@@ -31,19 +31,17 @@
typedef unsigned int uint;

#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128
#define SAMPLE_STRIDE     128

////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
                                   uint arrayLength, uint numValues,
                                   uint sortDir);
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
                                    uint batchSize, uint arrayLength);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);

extern "C" void closeMergeSort(void);

extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
                          uint *bufVal, uint *srcKey, uint *srcVal, uint N,
                          uint sortDir);
extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
                              uint *bufVal, uint *srcKey, uint *srcVal, uint N,
                              uint sortDir);
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

@@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir) {
  if (N <= 1) {
    return;
  }

  for (uint i = 0; i < N - 1; i++)
    if ((sortDir && (data[i] > data[i + 1])) ||
        (!sortDir && (data[i] < data[i + 1]))) {
      fprintf(stderr, "checkOrder() failed!!!\n");
      exit(EXIT_FAILURE);
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
}

static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend) {
  return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1)
                                           : (dividend / SAMPLE_STRIDE);
static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
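// Note: nextPowerOfTwo() smears the highest set bit into every lower bit and
// then increments, e.g. 300 -> 511 -> 512; the initial decrement keeps inputs
// that are already powers of two unchanged.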

static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] <= val)) ||
        (!sortDir && (data[newPos - 1] >= val))) {
      pos = newPos;
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
  if (L == 0) {
    return 0;
  }

  uint pos = 0;

  for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
    uint newPos = umin(pos + stride, L);

    if ((sortDir && (data[newPos - 1] < val)) ||
        (!sortDir && (data[newPos - 1] > val))) {
      pos = newPos;
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
  }

  return pos;
    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
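// The inclusive search counts keys ordered before-or-equal to val, the
// exclusive one counts keys strictly ordered before it; ranking A-elements
// exclusively in B and B-elements inclusively in A keeps the merge stable
// when keys compare equal.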

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
                                uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint sampleCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  for (uint pos = 0; pos < sampleCount; pos++) {
    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    const uint lenA = stride;
    const uint lenB = umin(stride, N - segmentBase - stride);
    const uint nA = stride / SAMPLE_STRIDE;
    const uint nB = getSampleCount(lenB);
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA   = stride / SAMPLE_STRIDE;
        const uint nB   = getSampleCount(lenB);

    if (i < nA) {
      ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
      ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
          binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
                                srcKey + segmentBase + stride, lenB, sortDir);
        if (i < nA) {
            ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
        }

        if (i < nB) {
            ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
        }
    }

    if (i < nB) {
      ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
      ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
          binarySearchInclusive(
              srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
              srcKey + segmentBase, lenA, sortDir);
    }
  }
}
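// Every SAMPLE_STRIDE-th key of each half is ranked against the other half,
// so step 2 can cut both halves into elementary intervals holding at most
// SAMPLE_STRIDE keys apiece.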

////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
                                 uint N) {
  uint lastSegmentElements = N % (2 * stride);
  uint sampleCount =
      (lastSegmentElements > stride)
          ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
          : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

  for (uint pos = 0; pos < sampleCount; pos++) {
    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    const uint lenA = stride;
    const uint lenB = umin(stride, N - segmentBase - stride);
    const uint nA = stride / SAMPLE_STRIDE;
    const uint nB = getSampleCount(lenB);
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA   = stride / SAMPLE_STRIDE;
        const uint nB   = getSampleCount(lenB);

    if (i < nA) {
      uint dstPosA =
          binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i],
                                ranks + (segmentBase + stride) / SAMPLE_STRIDE,
                                nB, 1) +
          i;
      assert(dstPosA < nA + nB);
      limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
          ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }

    if (i < nB) {
      uint dstPosA = binarySearchInclusive(
                         ranks[(segmentBase + stride) / SAMPLE_STRIDE + i],
                         ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) +
                     i;
      assert(dstPosA < nA + nB);
      limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
          ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
                  uint *srcBKey, uint *srcBVal, uint lenA, uint lenB,
                  uint sortDir) {
  checkOrder(srcAKey, lenA, sortDir);
  checkOrder(srcBKey, lenB, sortDir);
static void merge(uint *dstKey,
                  uint *dstVal,
                  uint *srcAKey,
                  uint *srcAVal,
                  uint *srcBKey,
                  uint *srcBVal,
                  uint  lenA,
                  uint  lenB,
                  uint  sortDir)
{
    checkOrder(srcAKey, lenA, sortDir);
    checkOrder(srcBKey, lenB, sortDir);

  for (uint i = 0; i < lenA; i++) {
    uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
    assert(dstPos < lenA + lenB);
    dstKey[dstPos] = srcAKey[i];
    dstVal[dstPos] = srcAVal[i];
  }
    for (uint i = 0; i < lenA; i++) {
        uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcAKey[i];
        dstVal[dstPos] = srcAVal[i];
    }

  for (uint i = 0; i < lenB; i++) {
    uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
    assert(dstPos < lenA + lenB);
    dstKey[dstPos] = srcBKey[i];
    dstVal[dstPos] = srcBVal[i];
  }
    for (uint i = 0; i < lenB; i++) {
        uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcBKey[i];
        dstVal[dstPos] = srcBVal[i];
    }
}

static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
                                     uint *srcVal, uint *limitsA, uint *limitsB,
                                     uint stride, uint N, uint sortDir) {
  uint lastSegmentElements = N % (2 * stride);
  uint mergePairs = (lastSegmentElements > stride)
                        ? getSampleCount(N)
                        : (N - lastSegmentElements) / SAMPLE_STRIDE;
static void mergeElementaryIntervals(uint *dstKey,
                                     uint *dstVal,
                                     uint *srcKey,
                                     uint *srcVal,
                                     uint *limitsA,
                                     uint *limitsB,
                                     uint  stride,
                                     uint  N,
                                     uint  sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

  for (uint pos = 0; pos < mergePairs; pos++) {
    uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
    uint segmentBase = (pos - i) * SAMPLE_STRIDE;
    for (uint pos = 0; pos < mergePairs; pos++) {
        uint i           = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
        uint segmentBase = (pos - i) * SAMPLE_STRIDE;

    const uint lenA = stride;
    const uint lenB = umin(stride, N - segmentBase - stride);
    const uint nA = stride / SAMPLE_STRIDE;
    const uint nB = getSampleCount(lenB);
    const uint n = nA + nB;
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA   = stride / SAMPLE_STRIDE;
        const uint nB   = getSampleCount(lenB);
        const uint n    = nA + nB;

    const uint startPosA = limitsA[pos];
    const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
    const uint startPosB = limitsB[pos];
    const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
    const uint startPosDst = startPosA + startPosB;
        const uint startPosA   = limitsA[pos];
        const uint endPosA     = (i + 1 < n) ? limitsA[pos + 1] : lenA;
        const uint startPosB   = limitsB[pos];
        const uint endPosB     = (i + 1 < n) ? limitsB[pos + 1] : lenB;
        const uint startPosDst = startPosA + startPosB;

    assert(startPosA <= endPosA && endPosA <= lenA);
    assert(startPosB <= endPosB && endPosB <= lenB);
    assert((endPosA - startPosA) <= SAMPLE_STRIDE);
    assert((endPosB - startPosB) <= SAMPLE_STRIDE);
        assert(startPosA <= endPosA && endPosA <= lenA);
        assert(startPosB <= endPosB && endPosB <= lenB);
        assert((endPosA - startPosA) <= SAMPLE_STRIDE);
        assert((endPosB - startPosB) <= SAMPLE_STRIDE);

    merge(dstKey + segmentBase + startPosDst,
          dstVal + segmentBase + startPosDst,
          (srcKey + segmentBase + 0) + startPosA,
          (srcVal + segmentBase + 0) + startPosA,
          (srcKey + segmentBase + stride) + startPosB,
          (srcVal + segmentBase + stride) + startPosB, endPosA - startPosA,
          endPosB - startPosB, sortDir);
  }
        merge(dstKey + segmentBase + startPosDst,
              dstVal + segmentBase + startPosDst,
              (srcKey + segmentBase + 0) + startPosA,
              (srcVal + segmentBase + 0) + startPosA,
              (srcKey + segmentBase + stride) + startPosB,
              (srcVal + segmentBase + stride) + startPosB,
              endPosA - startPosA,
              endPosB - startPosB,
              sortDir);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
  if (N <= 1) {
    return;
  }

  for (uint bottom = 0; bottom < N - 1; bottom++) {
    uint savePos = bottom;
    uint saveKey = key[bottom];

    for (uint i = bottom + 1; i < N; i++)
      if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
        savePos = i;
        saveKey = key[i];
      }

    if (savePos != bottom) {
      uint t;
      t = key[savePos];
      key[savePos] = key[bottom];
      key[bottom] = t;
      t = val[savePos];
      val[savePos] = val[bottom];
      val[bottom] = t;
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint bottom = 0; bottom < N - 1; bottom++) {
        uint savePos = bottom;
        uint saveKey = key[bottom];

        for (uint i = bottom + 1; i < N; i++)
            if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
                savePos = i;
                saveKey = key[i];
            }

        if (savePos != bottom) {
            uint t;
            t            = key[savePos];
            key[savePos] = key[bottom];
            key[bottom]  = t;
            t            = val[savePos];
            val[savePos] = val[bottom];
            val[bottom]  = t;
        }
    }
  }
}
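// Note: despite its name, bubbleSort() is a selection sort: each outer pass
// picks the extremal remaining key and performs a single swap. It is O(N^2),
// but it only ever runs on segments of at most SHARED_SIZE_LIMIT elements.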

////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
                              uint *bufVal, uint *srcKey, uint *srcVal, uint N,
                              uint sortDir) {
  uint *ikey, *ival, *okey, *oval;
  uint stageCount = 0;
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
{
    uint *ikey, *ival, *okey, *oval;
    uint  stageCount = 0;

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
    ;
    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
        ;

  if (stageCount & 1) {
    ikey = bufKey;
    ival = bufVal;
    okey = dstKey;
    oval = dstVal;
  } else {
    ikey = dstKey;
    ival = dstVal;
    okey = bufKey;
    oval = bufVal;
  }

  printf("Bottom-level sort...\n");
  memcpy(ikey, srcKey, N * sizeof(uint));
  memcpy(ival, srcVal, N * sizeof(uint));

  for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
    bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
               sortDir);
  }

  printf("Merge...\n");
  uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
  memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
  memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
  memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
  memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));

  for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
    uint lastSegmentElements = N % (2 * stride);

    // Find sample ranks and prepare for limiters merge
    generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);

    // Merge ranks and indices
    mergeRanksAndIndices(limitsA, ranksA, stride, N);
    mergeRanksAndIndices(limitsB, ranksB, stride, N);

    // Merge elementary intervals
    mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
                             N, sortDir);

    if (lastSegmentElements <= stride) {
      // Last merge segment consists of a single array which just needs to be
      // passed through
      memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
             lastSegmentElements * sizeof(uint));
      memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
             lastSegmentElements * sizeof(uint));
    if (stageCount & 1) {
        ikey = bufKey;
        ival = bufVal;
        okey = dstKey;
        oval = dstVal;
    }
    else {
        ikey = dstKey;
        ival = dstVal;
        okey = bufKey;
        oval = bufVal;
    }

    uint *t;
    t = ikey;
    ikey = okey;
    okey = t;
    t = ival;
    ival = oval;
    oval = t;
  }
    printf("Bottom-level sort...\n");
    memcpy(ikey, srcKey, N * sizeof(uint));
    memcpy(ival, srcVal, N * sizeof(uint));

  free(limitsB);
  free(limitsA);
  free(ranksB);
  free(ranksA);
    for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
        bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
    }

    printf("Merge...\n");
    uint *ranksA  = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    uint *ranksB  = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
    memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
    memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
    memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
    memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);

        // Find sample ranks and prepare for limiters merge
        generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);

        // Merge ranks and indices
        mergeRanksAndIndices(limitsA, ranksA, stride, N);
        mergeRanksAndIndices(limitsB, ranksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            memcpy(
                okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
            memcpy(
                oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
        }

        uint *t;
        t    = ikey;
        ikey = okey;
        okey = t;
        t    = ival;
        ival = oval;
        oval = t;
    }

    free(limitsB);
    free(limitsA);
    free(ranksB);
    free(ranksA);
}

@@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
                                   uint arrayLength, uint numValues,
                                   uint sortDir) {
  uint *srcHist;
  uint *resHist;
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

  if (arrayLength < 2) {
    printf("validateSortedKeys(): arrays too short, exiting...\n");
    return 1;
  }

  printf("...inspecting keys array: ");
  srcHist = (uint *)malloc(numValues * sizeof(uint));
  resHist = (uint *)malloc(numValues * sizeof(uint));

  int flag = 1;

  for (uint j = 0; j < batchSize;
       j++, srcKey += arrayLength, resKey += arrayLength) {
    // Build histograms for keys arrays
    memset(srcHist, 0, numValues * sizeof(uint));
    memset(resHist, 0, numValues * sizeof(uint));

    for (uint i = 0; i < arrayLength; i++) {
      if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
        srcHist[srcKey[i]]++;
        resHist[resKey[i]]++;
      } else {
        fprintf(
            stderr,
            "***Set %u source/result key arrays are not limited properly***\n",
            j);
        flag = 0;
        goto brk;
      }
    if (arrayLength < 2) {
        printf("validateSortedKeys(): arrays too short, exiting...\n");
        return 1;
    }

    // Compare the histograms
    for (uint i = 0; i < numValues; i++)
      if (srcHist[i] != resHist[i]) {
        fprintf(stderr,
                "***Set %u source/result keys histograms do not match***\n", j);
        flag = 0;
        goto brk;
      }
    printf("...inspecting keys array: ");
    srcHist = (uint *)malloc(numValues * sizeof(uint));
    resHist = (uint *)malloc(numValues * sizeof(uint));

    // Finally check the ordering
    for (uint i = 0; i < arrayLength - 1; i++)
      if ((sortDir && (resKey[i] > resKey[i + 1])) ||
          (!sortDir && (resKey[i] < resKey[i + 1]))) {
        fprintf(stderr,
                "***Set %u result key array is not ordered properly***\n", j);
        flag = 0;
        goto brk;
      }
  }
    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));

        for (uint i = 0; i < arrayLength; i++) {
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
        }

        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
    }

brk:
  free(resHist);
  free(srcHist);
    free(resHist);
    free(srcHist);

  if (flag) printf("OK\n");
    if (flag)
        printf("OK\n");

  return flag;
    return flag;
}
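// The histogram comparison proves the result is a permutation of the source
// keys; the ordering pass alone could not detect dropped or duplicated keys.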

////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N) {
  for (uint i = 0; i < N; i++) val[i] = i;
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
                                    uint batchSize, uint arrayLength) {
  int correctFlag = 1, stableFlag = 1;
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

  printf("...inspecting keys and values array: ");
    printf("...inspecting keys and values array: ");

  for (uint i = 0; i < batchSize;
       i++, resKey += arrayLength, resVal += arrayLength) {
    for (uint j = 0; j < arrayLength; j++) {
      if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0;
    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

      if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) &&
          (resVal[j] > resVal[j + 1]))
        stableFlag = 0;
            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }
  }

  printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
  printf(stableFlag ? "...stability property: stable!\n"
                    : "...stability property: NOT stable\n");
    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

  return correctFlag;
    return correctFlag;
}

@@ -29,106 +29,105 @@
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>
#include <cuda/barrier>
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>  // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check

namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(
    cuda::barrier<cuda::thread_scope_block> &barrier,
    cg::thread_block_tile<32> &tile32, double &threadSum, double *result) {
  extern __shared__ double tmp[];

#pragma unroll
  for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
    threadSum += tile32.shfl_down(threadSum, offset);
  }
  if (tile32.thread_rank() == 0) {
    tmp[tile32.meta_group_rank()] = threadSum;
  }

  auto token = barrier.arrive();

  barrier.wait(std::move(token));

  // The warp 0 will perform last round of reduction
  if (tile32.meta_group_rank() == 0) {
    double beta = tile32.thread_rank() < tile32.meta_group_size()
                      ? tmp[tile32.thread_rank()]
                      : 0.0;
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32>               &tile32,
                                double                                  &threadSum,
                                double                                  *result)
{
    extern __shared__ double tmp[];

#pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      beta += tile32.shfl_down(beta, offset);
        threadSum += tile32.shfl_down(threadSum, offset);
    }
    if (tile32.thread_rank() == 0) {
        tmp[tile32.meta_group_rank()] = threadSum;
    }

    if (tile32.thread_rank() == 0) {
      if (writeSquareRoot)
        *result = sqrt(beta);
      else
        *result = beta;
    auto token = barrier.arrive();

    barrier.wait(std::move(token));

    // Warp 0 performs the last round of the reduction
    if (tile32.meta_group_rank() == 0) {
        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;

#pragma unroll
        for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
            beta += tile32.shfl_down(beta, offset);
        }

        if (tile32.thread_rank() == 0) {
            if (writeSquareRoot)
                *result = sqrt(beta);
            else
                *result = beta;
        }
    }
  }
}
#endif
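// reduceBlockData() reduces within each 32-thread tile via shfl_down, parks
// one partial sum per warp in shared memory, and uses the arrive/wait barrier
// so that warp 0 can safely combine the per-warp partials.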

__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
                                             double *partialResults, int size) {
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
  cg::thread_block cta = cg::this_thread_block();
  cg::grid_group grid = cg::this_grid();
  ;
  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
    cg::thread_block cta  = cg::this_thread_block();
    cg::grid_group   grid = cg::this_grid();
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;

  if (threadIdx.x == 0) {
    init(&barrier, blockDim.x);
  }

  cg::sync(cta);

  double threadSum = 0.0;
  for (int i = grid.thread_rank(); i < size; i += grid.size()) {
    threadSum += (double)(vecA[i] * vecB[i]);
  }

  // Each thread block performs reduction of partial dotProducts and writes to
  // global mem.
  reduceBlockData<false>(barrier, tile32, threadSum,
                         &partialResults[blockIdx.x]);

  cg::sync(grid);

  // One block performs the final summation of partial dot products
  // of all the thread blocks and writes the sqrt of final dot product.
  if (blockIdx.x == 0) {
    threadSum = 0.0;
    for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
      threadSum += partialResults[i];
    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }
    reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
  }

  cg::sync(grid);
    cg::sync(cta);

  const double finalValue = partialResults[0];
    double threadSum = 0.0;
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        threadSum += (double)(vecA[i] * vecB[i]);
    }

  // Perform normalization of vecA & vecB.
  for (int i = grid.thread_rank(); i < size; i += grid.size()) {
    vecA[i] = (float)vecA[i] / finalValue;
    vecB[i] = (float)vecB[i] / finalValue;
  }
    // Each thread block performs reduction of partial dotProducts and writes to
    // global mem.
    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);

    cg::sync(grid);

    // One block performs the final summation of partial dot products
    // of all the thread blocks and writes the sqrt of final dot product.
    if (blockIdx.x == 0) {
        threadSum = 0.0;
        for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
            threadSum += partialResults[i];
        }
        reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
    }

    cg::sync(grid);

    const double finalValue = partialResults[0];

    // Perform normalization of vecA & vecB.
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        vecA[i] = (float)vecA[i] / finalValue;
        vecB[i] = (float)vecB[i] / finalValue;
    }
#endif
}
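// The kernel synchronizes across the whole grid with cg::sync(grid), which is
// only valid under a cooperative launch; main() checks
// cudaDevAttrCooperativeLaunch for exactly this reason.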

@@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s starting...\n", argv[0]);
int main(int argc, char **argv)
{
    printf("%s starting...\n", argv[0]);

  // This will pick the best possible CUDA capable device
  int dev = findCudaDevice(argc, (const char **)argv);
    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

  int major = 0;
  checkCudaErrors(
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
    int major = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

  // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
 | 
			
		||||
  if (major < 7) {
 | 
			
		||||
    printf("simpleAWBarrier requires SM 7.0 or higher.  Exiting...\n");
 | 
			
		||||
    exit(EXIT_WAIVED);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int supportsCooperativeLaunch = 0;
 | 
			
		||||
  checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
 | 
			
		||||
                                         cudaDevAttrCooperativeLaunch, dev));
 | 
			
		||||
 | 
			
		||||
  if (!supportsCooperativeLaunch) {
 | 
			
		||||
    printf(
 | 
			
		||||
        "\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
 | 
			
		||||
        "Waiving the run\n",
 | 
			
		||||
        dev);
 | 
			
		||||
    exit(EXIT_WAIVED);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
 | 
			
		||||
 | 
			
		||||
  printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
 | 
			
		||||
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
 | 
			
		||||
  float *vecA, *d_vecA;
 | 
			
		||||
  float *vecB, *d_vecB;
 | 
			
		||||
  double *d_partialResults;
 | 
			
		||||
  int size = 10000000;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
 | 
			
		||||
  checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
 | 
			
		||||
  checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
  float baseVal = 2.0;
 | 
			
		||||
  for (int i = 0; i < size; i++) {
 | 
			
		||||
    vecA[i] = vecB[i] = baseVal;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  cudaStream_t stream;
 | 
			
		||||
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
 | 
			
		||||
                                  cudaMemcpyHostToDevice, stream));
 | 
			
		||||
  checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
 | 
			
		||||
                                  cudaMemcpyHostToDevice, stream));
 | 
			
		||||
 | 
			
		||||
  // Kernel configuration, where a one-dimensional
 | 
			
		||||
  // grid and one-dimensional blocks are configured.
 | 
			
		||||
  int minGridSize = 0, blockSize = 0;
 | 
			
		||||
  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
 | 
			
		||||
      &minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
 | 
			
		||||
 | 
			
		||||
  int smemSize = ((blockSize / 32) + 1) * sizeof(double);
 | 
			
		||||
 | 
			
		||||
  int numBlocksPerSm = 0;
 | 
			
		||||
  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 | 
			
		||||
      &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
 | 
			
		||||
 | 
			
		||||
  int multiProcessorCount = 0;
 | 
			
		||||
  checkCudaErrors(cudaDeviceGetAttribute(
 | 
			
		||||
      &multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
 | 
			
		||||
 | 
			
		||||
  minGridSize = multiProcessorCount * numBlocksPerSm;
 | 
			
		||||
  checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
 | 
			
		||||
 | 
			
		||||
  printf(
 | 
			
		||||
      "Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
 | 
			
		||||
      "blockSize = %d\n",
 | 
			
		||||
      minGridSize, blockSize);
 | 
			
		||||
 | 
			
		||||
  dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
 | 
			
		||||
 | 
			
		||||
  void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
 | 
			
		||||
                        (void *)&d_partialResults, (void *)&size};
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
 | 
			
		||||
                                  dimBlock, kernelArgs, smemSize, stream));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
 | 
			
		||||
                                  cudaMemcpyDeviceToHost, stream));
 | 
			
		||||
  checkCudaErrors(cudaStreamSynchronize(stream));
 | 
			
		||||
 | 
			
		||||
  float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
 | 
			
		||||
  unsigned int matches = 0;
 | 
			
		||||
  for (int i = 0; i < size; i++) {
 | 
			
		||||
    if ((vecA[i] - expectedResult) > 0.00001) {
 | 
			
		||||
      printf("mismatch at i = %d\n", i);
 | 
			
		||||
      break;
 | 
			
		||||
    } else {
 | 
			
		||||
      matches++;
 | 
			
		||||
    // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
 | 
			
		||||
    if (major < 7) {
 | 
			
		||||
        printf("simpleAWBarrier requires SM 7.0 or higher.  Exiting...\n");
 | 
			
		||||
        exit(EXIT_WAIVED);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
 | 
			
		||||
  checkCudaErrors(cudaFree(d_vecA));
 | 
			
		||||
  checkCudaErrors(cudaFree(d_vecB));
 | 
			
		||||
  checkCudaErrors(cudaFree(d_partialResults));
 | 
			
		||||
    int supportsCooperativeLaunch = 0;
 | 
			
		||||
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaFreeHost(vecA));
 | 
			
		||||
  checkCudaErrors(cudaFreeHost(vecB));
 | 
			
		||||
  return matches == size;
 | 
			
		||||
    if (!supportsCooperativeLaunch) {
 | 
			
		||||
        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
 | 
			
		||||
               "Waiving the run\n",
 | 
			
		||||
               dev);
 | 
			
		||||
        exit(EXIT_WAIVED);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
 | 
			
		||||
 | 
			
		||||
    printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
 | 
			
		||||
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
 | 
			
		||||
{
 | 
			
		||||
    float  *vecA, *d_vecA;
 | 
			
		||||
    float  *vecB, *d_vecB;
 | 
			
		||||
    double *d_partialResults;
 | 
			
		||||
    int     size = 10000000;
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
 | 
			
		||||
    checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
 | 
			
		||||
    checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));
 | 
			
		||||
 | 
			
		||||
    float baseVal = 2.0;
 | 
			
		||||
    for (int i = 0; i < size; i++) {
 | 
			
		||||
        vecA[i] = vecB[i] = baseVal;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    cudaStream_t stream;
 | 
			
		||||
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
 | 
			
		||||
    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
 | 
			
		||||
 | 
			
		||||
    // Kernel configuration, where a one-dimensional
 | 
			
		||||
    // grid and one-dimensional blocks are configured.
 | 
			
		||||
    int minGridSize = 0, blockSize = 0;
 | 
			
		||||
    checkCudaErrors(
 | 
			
		||||
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
 | 
			
		||||
 | 
			
		||||
    int smemSize = ((blockSize / 32) + 1) * sizeof(double);
 | 
			
		||||
 | 
			
		||||
    int numBlocksPerSm = 0;
 | 
			
		||||
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 | 
			
		||||
        &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
 | 
			
		||||
 | 
			
		||||
    int multiProcessorCount = 0;
 | 
			
		||||
    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
 | 
			
		||||
 | 
			
		||||
    minGridSize = multiProcessorCount * numBlocksPerSm;
 | 
			
		||||
    checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
 | 
			
		||||
 | 
			
		||||
    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
 | 
			
		||||
           "blockSize = %d\n",
 | 
			
		||||
           minGridSize,
 | 
			
		||||
           blockSize);
 | 
			
		||||
 | 
			
		||||
    dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
 | 
			
		||||
 | 
			
		||||
    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaLaunchCooperativeKernel(
 | 
			
		||||
        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
 | 
			
		||||
    checkCudaErrors(cudaStreamSynchronize(stream));
 | 
			
		||||
 | 
			
		||||
    float        expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
 | 
			
		||||
    unsigned int matches        = 0;
 | 
			
		||||
    for (int i = 0; i < size; i++) {
 | 
			
		||||
        if ((vecA[i] - expectedResult) > 0.00001) {
 | 
			
		||||
            printf("mismatch at i = %d\n", i);
 | 
			
		||||
            break;
 | 
			
		||||
        }
 | 
			
		||||
        else {
 | 
			
		||||
            matches++;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
 | 
			
		||||
    checkCudaErrors(cudaFree(d_vecA));
 | 
			
		||||
    checkCudaErrors(cudaFree(d_vecB));
 | 
			
		||||
    checkCudaErrors(cudaFree(d_partialResults));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cudaFreeHost(vecA));
 | 
			
		||||
    checkCudaErrors(cudaFreeHost(vecB));
 | 
			
		||||
    return matches == size;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
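// Editor's note: the pass criterion in the validation loop above is plain
// algebra. Every element of both vectors starts at baseVal, so the dot product
// is size * baseVal * baseVal, its square root is baseVal * sqrt(size), and each
// normalized element becomes baseVal / (baseVal * sqrt(size)) = 1 / sqrt(size).
// A host-side restatement (function name illustrative):
#include <math.h>

static float expectedNormalizedValue(float baseVal, int size)
{
    // identical to the sample's expectedResult expression
    return baseVal / sqrtf((float)size * baseVal * baseVal);
}
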
@@ -34,17 +34,17 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

@@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

////////////////////////////////////////////////////////////////////////////////
@@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int         Nblocks  = 2;
    int         Nthreads = 32;
    cudaError_t error;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
        printf("simpleAssert is not currently supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }

#endif

    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert) {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }

    testResult = error == cudaErrorAssert;
}

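// Editor's note: like host assert(), device-side assert() compiles to a no-op
// when NDEBUG is defined, so the cudaErrorAssert expected above only appears in
// builds that keep assertions enabled. A hedged sketch (kernel name illustrative):
#include <cassert>

__global__ void checkedKernel(int N)
{
    // Compiled with `nvcc -DNDEBUG`, this assert vanishes and the kernel
    // completes normally even for out-of-range threads.
    assert(blockIdx.x * blockDim.x + threadIdx.x < N);
}
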
@@ -34,15 +34,16 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

#include "nvrtc_helper.h"

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAssert_nvrtc";

@@ -58,56 +59,63 @@ void runTest(int argc, char **argv);
// Program main
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks  = 2;
    int Nthreads = 32;

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.

    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    int   count  = 60;
    void *args[] = {(void *)&count};

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   dimGrid.x,
                                   dimGrid.y,
                                   dimGrid.z, /* grid dim */
                                   dimBlock.x,
                                   dimBlock.y,
                                   dimBlock.z, /* block dim */
                                   0,
                                   0,        /* shared mem, stream */
                                   &args[0], /* arguments */
                                   0));

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    CUresult res = cuCtxSynchronize();

    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (res == CUDA_ERROR_ASSERT) {
        printf("Device assert failed as expected\n");
    }

    testResult = res == CUDA_ERROR_ASSERT;
}

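// Editor's note: with the driver API, the kernelParams argument of cuLaunchKernel
// is an array of pointers to each parameter's storage, in declaration order, not
// the parameter values themselves. For a hypothetical kernel k(int n, float *p):
//
//     int    n = 60;
//     float *p = nullptr;
//     void  *params[] = {&n, &p}; // address of n, address of the pointer p
//     // cuLaunchKernel(k_addr, gx, gy, gz, bx, by, bz, 0, 0, params, 0);
//
// which is exactly the shape of the args[] array used above.
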
@@ -32,7 +32,8 @@
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////

extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

@@ -30,10 +30,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -45,10 +45,10 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

// Includes, kernels
#include "simpleAtomicIntrinsics_kernel.cuh"
@@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    cudaStream_t stream;
    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData;
    checkCudaErrors(cudaMallocHost(&hOData, memSize));

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // allocate device memory for result
    int *dOData;
    checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
    // copy host memory to device to initialize to zero
    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

    // execute the kernel
    testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    checkCudaErrors(cudaFreeHost(hOData));
    checkCudaErrors(cudaFree(dOData));
}

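// Editor's note: hOData is allocated with cudaMallocHost (pinned memory) so the
// cudaMemcpyAsync calls above can genuinely overlap with the stream; with
// pageable malloc memory the runtime stages the copy and it behaves largely
// synchronously. A minimal sketch of the pattern (names illustrative):
//
//     int *h = nullptr;
//     checkCudaErrors(cudaMallocHost(&h, bytes));                    // pinned
//     checkCudaErrors(cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice, s));
//     checkCudaErrors(cudaStreamSynchronize(s)); // wait before touching h again
//     checkCudaErrors(cudaFreeHost(h));
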
@@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

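// Editor's note on the atomicCAS membership check above: the device test performs
// atomicCAS(&g_odata[7], tid - 1, tid), which only succeeds for a thread whose
// tid - 1 matches the current value. Threads therefore form a data-dependent,
// order-sensitive chain, and the final value is some tid in [0, len); that is all
// the host reference can (and does) verify with its linear membership scan.
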
@@ -35,48 +35,49 @@
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_

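// Editor's note: atomicInc and atomicDec are not plain +-1; they wrap at a
// caller-supplied limit, which is what the host reference re-implements. A hedged
// sketch of the semantics (see the CUDA programming guide for the authoritative
// wording; the function name here is illustrative):
__device__ void wrapSemantics(unsigned int *p, unsigned int limit)
{
    // atomicInc: old = *p; *p = (old >= limit) ? 0 : old + 1;
    atomicInc(p, limit);
    // atomicDec: old = *p; *p = (old == 0 || old > limit) ? limit : old - 1;
    atomicDec(p, limit);
}
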
@@ -30,10 +30,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -46,7 +46,7 @@
#include <nvrtc_helper.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAtomicIntrinsics_nvrtc";

@@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////

void runTest(int argc, char **argv)
{
    int dev = 0;

    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData = (int *)malloc(memSize);

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    CUdeviceptr dOData;
    checkCudaErrors(cuMemAlloc(&dOData, memSize));
    checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize));

    // execute the kernel
    dim3 cudaBlockSize(numThreads, 1, 1);
    dim3 cudaGridSize(numBlocks, 1, 1);

    void *arr[] = {(void *)&dOData};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize));

    // Copy result from device to host
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    checkCudaErrors(cuMemFree(dOData));
}

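// Editor's note: unlike the runtime variant of this sample, which pairs
// cudaMemcpyAsync with a stream synchronize, the driver-API copies used above
// (cuMemcpyHtoD / cuMemcpyDtoH) are synchronous with respect to the host, so the
// only explicit wait needed is the cuCtxSynchronize after the kernel launch.
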
@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len);
 | 
			
		||||
//! @param len        number of elements in reference / idata
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
int computeGold(int *gpuData, const int len) {
 | 
			
		||||
  int val = 0;
 | 
			
		||||
int computeGold(int *gpuData, const int len)
 | 
			
		||||
{
 | 
			
		||||
    int val = 0;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    val += 10;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[0]) {
 | 
			
		||||
    printf("atomicAdd failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  val = 0;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    val -= 10;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[1]) {
 | 
			
		||||
    printf("atomicSub failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  bool found = false;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    // third element should be a member of [0, len)
 | 
			
		||||
    if (i == gpuData[2]) {
 | 
			
		||||
      found = true;
 | 
			
		||||
      break;
 | 
			
		||||
    for (int i = 0; i < len; ++i) {
 | 
			
		||||
        val += 10;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (!found) {
 | 
			
		||||
    printf("atomicExch failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  val = -(1 << 8);
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    // fourth element should be len-1
 | 
			
		||||
    val = max(val, i);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[3]) {
 | 
			
		||||
    printf("atomicMax failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  val = 1 << 8;
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < len; ++i) {
 | 
			
		||||
    val = min(val, i);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (val != gpuData[4]) {
 | 
			
		||||
    printf("atomicMin failed\n");
 | 
			
		||||
    return false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
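The two ternary recurrences above mirror the documented update rules of atomicInc and atomicDec. As plain host-side helpers (a reference sketch, not part of this sample):

// Host-side mirrors of the device atomicInc/atomicDec update rules:
//   atomicInc: old >= val ? 0 : old + 1
//   atomicDec: old == 0 || old > val ? val : old - 1
static unsigned int incMod(unsigned int old, unsigned int val) { return (old >= val) ? 0 : old + 1; }

static unsigned int decMod(unsigned int old, unsigned int val) { return (old == 0 || old > val) ? val : old - 1; }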
@@ -36,45 +36,46 @@
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////

extern "C" __global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions
    // Arithmetic atomic instructions
    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions
    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
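For context, a minimal host-side driver sketch for a kernel like this, seeded to match the host reference (computeGold) above; the function name and launch shape are illustrative assumptions, not taken from the diff:

#include <cuda_runtime.h>
#include <helper_cuda.h>

extern "C" __global__ void testKernel(int *g_odata);

// Hypothetical driver: seed the 11 result slots, run the atomics, read back.
void runAtomicsOnce(int len)
{
    const int numValues = 11; // one slot per atomic operation above
    int       h_odata[11];

    for (int i = 0; i < numValues; i++)
        h_odata[i] = 0;
    h_odata[3]  = -(1 << 8); // atomicMax start, as in the reference
    h_odata[4]  = 1 << 8;    // atomicMin start
    h_odata[8]  = 0xff;      // atomicAnd start
    h_odata[10] = 0xff;      // atomicXor start

    int *d_odata = nullptr;
    checkCudaErrors(cudaMalloc(&d_odata, numValues * sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_odata, h_odata, numValues * sizeof(int), cudaMemcpyHostToDevice));
    testKernel<<<1, len>>>(d_odata); // illustrative launch shape
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, numValues * sizeof(int), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_odata));
}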
@@ -26,30 +26,31 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);

cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
    cudaAccessPolicyWindow accessPolicyWindow = {0};
    accessPolicyWindow.base_ptr               = (void *)0;
    accessPolicyWindow.num_bytes              = 0;
    accessPolicyWindow.hitRatio               = 0.f;
    accessPolicyWindow.hitProp                = cudaAccessPropertyNormal;
    accessPolicyWindow.missProp               = cudaAccessPropertyStreaming;
    return accessPolicyWindow;
}

////////////////////////////////////////////////////////////////////////////////
@@ -60,35 +61,35 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) {
//! @param bigDataSize  input bigData size
//! @param hitCount  how many data accesses are done within a block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
{
    __shared__ unsigned int hit;
    int                     row    = blockIdx.y * blockDim.y + threadIdx.y;
    int                     col    = blockIdx.x * blockDim.x + threadIdx.x;
    int                     tID    = row * blockDim.y + col;
    uint32_t                psRand = tID;

    atomicExch(&hit, 0);
    __syncthreads();
    while (hit < hitCount) {
        psRand ^= psRand << 13;
        psRand ^= psRand >> 17;
        psRand ^= psRand << 5;

        int idx = tID - psRand;
        if (idx < 0) {
            idx = -idx;
        }

        if ((tID % 2) == 0) {
            data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
        }
        else {
            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
        }

        atomicAdd(&hit, 1);
    }
}
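The pseudo-random index above is the classic xorshift32 recurrence; a standalone host-side sketch of the same generator, for reference only:

#include <stdint.h>

// xorshift32: the same three-shift recurrence used in kernCacheSegmentTest.
// Any nonzero seed yields a sequence of period 2^32 - 1.
static uint32_t xorshift32(uint32_t x)
{
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return x;
}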
////////////////////////////////////////////////////////////////////////////////
// Program main
@@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool                   bTestResult = true;
    cudaAccessPolicyWindow accessPolicyWindow;
    cudaDeviceProp         deviceProp;
    cudaStreamAttrValue    streamAttrValue;
    cudaStream_t           stream;
    cudaStreamAttrID       streamAttrID;
    dim3                   threads(32, 32);
    int                   *dataDevicePointer;
    int                   *dataHostPointer;
    int                    dataSize;
    int                   *bigDataDevicePointer;
    int                   *bigDataHostPointer;
    int                    bigDataSize;
    StopWatchInterface    *timer = 0;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    // Get device properties
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    dim3 blocks(deviceProp.maxGridSize[1], 1);

    // Make sure the device supports the persisting L2 cache optimization
    if (deviceProp.persistingL2CacheMaxSize == 0) {
        printf("Waiving execution as device %d does not support persisting L2 "
               "Caching\n",
               devID);
        exit(EXIT_WAIVED);
    }

    // Create stream to associate with window
    checkCudaErrors(cudaStreamCreate(&stream));

    // Set the amount of L2 cache that will be persisting to the maximum the
    // device can support
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

    // Stream attribute to set
    streamAttrID = cudaStreamAttributeAccessPolicyWindow;

    // Default window
    streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
    accessPolicyWindow                 = initAccessPolicyWindow();

    // Allocate size of both buffers
    bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
    dataSize    = (deviceProp.l2CacheSize / 4) / sizeof(int);

    // Allocate data
    checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

    for (int i = 0; i < bigDataSize; ++i) {
        if (i < dataSize) {
            dataHostPointer[i] = i;
        }

        bigDataHostPointer[bigDataSize - i - 1] = i;
    }

    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
    checkCudaErrors(
        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(
        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

    // Make a window for the buffer of interest
    accessPolicyWindow.base_ptr        = (void *)dataDevicePointer;
    accessPolicyWindow.num_bytes       = dataSize * sizeof(int);
    accessPolicyWindow.hitRatio        = 1.f;
    accessPolicyWindow.hitProp         = cudaAccessPropertyPersisting;
    accessPolicyWindow.missProp        = cudaAccessPropertyNormal;
    streamAttrValue.accessPolicyWindow = accessPolicyWindow;

    // Assign window to stream
    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

    // Demote any previous persisting lines
    checkCudaErrors(cudaCtxResetPersistingL2Cache());

    checkCudaErrors(cudaStreamSynchronize(stream));
    kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
        dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);

    checkCudaErrors(cudaStreamSynchronize(stream));
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // Free memory
    checkCudaErrors(cudaFreeHost(dataHostPointer));
    checkCudaErrors(cudaFreeHost(bigDataHostPointer));
    checkCudaErrors(cudaFree(dataDevicePointer));
    checkCudaErrors(cudaFree(bigDataDevicePointer));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
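When a window is larger than the L2 set-aside, the usual mitigation is a fractional hitRatio so only part of the accesses within the window are treated as persisting; a minimal sketch (the 0.6 value and function name are illustrative assumptions, not from this sample):

#include <cuda_runtime.h>

// Sketch: build a window whose accesses are only partially persisting, to
// avoid thrashing when num_bytes exceeds the reserved L2 region.
static cudaAccessPolicyWindow makeFractionalWindow(void *ptr, size_t bytes)
{
    cudaAccessPolicyWindow window = {0};
    window.base_ptr  = ptr;
    window.num_bytes = bytes;
    window.hitRatio  = 0.6f; // ~60% of accesses get hitProp; illustrative value
    window.hitProp   = cudaAccessPropertyPersisting;
    window.missProp  = cudaAccessPropertyStreaming;
    return window;
}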
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)
@@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b)
{
    r = clamp(r, 0.0f, 255.0f);
    g = clamp(g, 0.0f, 255.0f);
    b = clamp(b, 0.0f, 255.0f);
    return (int(b) << 16) | (int(g) << 8) | int(r);
}

__global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
    extern __shared__ uchar4 sdata[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int x  = blockIdx.x * bw + tx;
    int y  = blockIdx.y * bh + ty;

    uchar4 c4             = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);
    g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
{
    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
}
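For reference, the inverse of the 24-bit BGR packing produced by rgbToInt above (a sketch, not part of the sample):

// Unpack the (b << 16) | (g << 8) | r layout written by rgbToInt.
__device__ void intToRgb(int c, float &r, float &g, float &b)
{
    r = float(c & 0xff);
    g = float((c >> 8) & 0xff);
    b = float((c >> 16) & 0xff);
}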
@@ -29,115 +29,124 @@

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}

// Wait for thread to finish
void cutEndThread(CUTThread thread)
{
    WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
}

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    WaitForMultipleObjects(num, threads, true, INFINITE);

    for (int i = 0; i < num; i++) {
        CloseHandle(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    InitializeCriticalSection(&barrier.criticalSection);
    barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    EnterCriticalSection(&barrier->criticalSection);
    myBarrierCount = ++barrier->count;
    LeaveCriticalSection(&barrier->criticalSection);

    if (myBarrierCount >= barrier->releaseCount) {
        SetEvent(barrier->barrierEvent);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {}

#else
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    pthread_t thread;
    pthread_create(&thread, NULL, func, data);
    return thread;
}

// Wait for thread to finish
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    for (int i = 0; i < num; i++) {
        cutEndThread(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    pthread_mutex_init(&barrier.mutex, 0);
    pthread_cond_init(&barrier.conditionVariable, 0);

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    pthread_mutex_lock(&barrier->mutex);
    myBarrierCount = ++barrier->count;
    pthread_mutex_unlock(&barrier->mutex);

    if (myBarrierCount >= barrier->releaseCount) {
        pthread_cond_signal(&barrier->conditionVariable);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier)
{
    pthread_mutex_lock(&barrier->mutex);

    while (barrier->count < barrier->releaseCount) {
        pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
    }

    pthread_mutex_unlock(&barrier->mutex);
}

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier)
{
    pthread_mutex_destroy(&barrier->mutex);
    pthread_cond_destroy(&barrier->conditionVariable);
}

#endif
@@ -37,15 +37,16 @@
typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

struct CUTBarrier
{
    CRITICAL_SECTION criticalSection;
    HANDLE           barrierEvent;
    int              releaseCount;
    int              count;
};

#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND  return 0

#else
// POSIX threads.
@@ -55,44 +56,46 @@ typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);

#define CUT_THREADPROC void *
#define CUT_THREADEND  return 0

struct CUTBarrier
{
    pthread_mutex_t mutex;
    pthread_cond_t  conditionVariable;
    int             releaseCount;
    int             count;
};

#endif

#ifdef __cplusplus
extern "C"
{
#endif

    // Create thread.
    CUTThread cutStartThread(CUT_THREADROUTINE, void *data);

    // Wait for thread to finish.
    void cutEndThread(CUTThread thread);

    // Wait for multiple threads.
    void cutWaitForThreads(const CUTThread *threads, int num);

    // Create barrier.
    CUTBarrier cutCreateBarrier(int releaseCount);

    // Increment barrier. (execution continues)
    void cutIncrementBarrier(CUTBarrier *barrier);

    // Wait for barrier release.
    void cutWaitForBarrier(CUTBarrier *barrier);

    // Destroy barrier
    void cutDestroyBarrier(CUTBarrier *barrier);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MULTITHREADING_H
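A minimal usage sketch of this barrier API (the worker count and printf payload are illustrative):

#include <stdio.h>
#include "multithreading.h"

static CUTBarrier barrier;

// Each worker signals the barrier once and exits.
static CUT_THREADPROC worker(void *arg)
{
    printf("worker %d done\n", *(int *)arg);
    cutIncrementBarrier(&barrier); // non-blocking signal
    CUT_THREADEND;
}

int main()
{
    const int N = 4;
    int       ids[4];

    barrier = cutCreateBarrier(N);
    for (int i = 0; i < N; ++i) {
        ids[i] = i;
        cutStartThread(worker, &ids[i]);
    }
    cutWaitForBarrier(&barrier); // blocks until all N increments arrive
    cutDestroyBarrier(&barrier);
    return 0;
}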
@@ -43,172 +43,173 @@
#include <stdio.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#include "multithreading.h"

const int N_workloads             = 8;
const int N_elements_per_workload = 100000;

CUTBarrier thread_barrier;

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data);

struct heterogeneous_workload
{
    int id;
    int cudaDeviceID;

    int         *h_data;
    int         *d_data;
    cudaStream_t stream;

    bool success;
};

__global__ void incKernel(int *data, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N)
        data[i]++;
}

CUT_THREADPROC launch(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // Allocate Resources
    checkCudaErrors(cudaStreamCreate(&workload->stream));
    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));

    // CPU thread generates data
    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->h_data[i] = workload->id + i;
    }

    // Schedule work for GPU in CUDA stream without blocking the CPU thread
    // Note: Dedicated streams enable concurrent execution of workloads on the GPU
    dim3 block(512);
    dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
                                    workload->h_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyHostToDevice,
                                    workload->stream));
    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
                                    workload->d_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyDeviceToHost,
                                    workload->stream));

    // New in CUDA 5.0: Add a CPU callback which is called once all currently
    // pending operations in the CUDA stream have finished
    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

    CUT_THREADEND;
    // CPU thread end of life, GPU continues to process data...
}

CUT_THREADPROC postprocess(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
    // ... GPU is done with processing, continue on new CPU thread...

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // CPU thread consumes results from GPU, verifying every element
    workload->success = true;

    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->success &= workload->h_data[i] == i + workload->id + 1;
    }

    // Free Resources
    checkCudaErrors(cudaFree(workload->d_data));
    checkCudaErrors(cudaFreeHost(workload->h_data));
    checkCudaErrors(cudaStreamDestroy(workload->stream));

    // Signal the end of the heterogeneous workload to main thread
    cutIncrementBarrier(&thread_barrier);

    CUT_THREADEND;
}

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
{
    // Check status of GPU after stream operations are done
    checkCudaErrors(status);

    // Spawn new CPU worker thread and continue processing on the CPU
    cutStartThread(postprocess, data);
}

int main(int argc, char **argv)
{
    int N_gpus, max_gpus = 0;
    int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

    printf("Starting simpleCallback\n");

    checkCudaErrors(cudaGetDeviceCount(&N_gpus));
    printf("Found %d CUDA capable GPUs\n", N_gpus);

    if (N_gpus > 32) {
        printf("simpleCallback only supports a maximum of 32 GPUs\n");
    }

    for (int devid = 0; devid < N_gpus; devid++) {
        int            SMversion;
        cudaDeviceProp deviceProp;
        cudaSetDevice(devid);
        cudaGetDeviceProperties(&deviceProp, devid);
        SMversion = (deviceProp.major << 4) + deviceProp.minor; // e.g. 0x11 == SM 1.1
        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[max_gpus++] = devid;
        }
    }

    printf("%d GPUs available to run Callback Functions\n", max_gpus);

    heterogeneous_workload *workloads;
    workloads      = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
    thread_barrier = cutCreateBarrier(N_workloads);

    // Main thread spawns a CPU worker thread for each heterogeneous workload
    printf("Starting %d heterogeneous computing workloads\n", N_workloads);

    for (int i = 0; i < N_workloads; ++i) {
        workloads[i].id           = i;
        workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;

        cutStartThread(launch, &workloads[i]);
    }

    // Sleep until all workloads have finished
    cutWaitForBarrier(&thread_barrier);
    printf("Total of %d workloads finished:\n", N_workloads);

    bool success = true;

    for (int i = 0; i < N_workloads; ++i) {
        success &= workloads[i].success;
    }

    printf("%s\n", success ? "Success" : "Failure");

    free(workloads);

    exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
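Aside: on current toolkits cudaStreamAddCallback is deprecated in favor of cudaLaunchHostFunc; a minimal sketch of the equivalent enqueue (the host-function variant takes no status argument, so errors must be checked separately):

// Sketch: host-function variant of the callback enqueue used in launch().
// myHostFunc is a hypothetical name; postprocess/workload come from the sample.
static void CUDART_CB myHostFunc(void *data)
{
    cutStartThread(postprocess, data); // same hand-off as myStreamCallback
}

// ... inside launch(), after the async copies:
// checkCudaErrors(cudaLaunchHostFunc(workload->stream, myHostFunc, workload));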
@@ -38,8 +38,8 @@
 *
 */

#include <cooperative_groups.h>
#include <stdio.h>

using namespace cooperative_groups;
@@ -49,35 +49,36 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
__device__ int sumReduction(thread_group g, int *x, int val)
{
    // rank of this thread in the group
    int lane = g.thread_rank();

    // for each iteration of this loop, the number of threads active in the
    // reduction, i, is halved, and each active thread (with index [lane])
    // performs a single summation of its own value with that
    // of a "partner" (with index [lane+i]).
    for (int i = g.size() / 2; i > 0; i /= 2) {
        // store value for this thread in temporary array
        x[lane] = val;

        // synchronize all threads in group
        g.sync();

        if (lane < i)
            // active threads perform summation of their value with
            // their partner's value
            val += x[lane + i];

        // synchronize all threads in group
        g.sync();
    }

    // master thread in group returns result, and others return -1.
    if (g.thread_rank() == 0)
        return val;
    else
        return -1;
}
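For tiles of warp size or smaller, the shared-memory workspace can be avoided entirely with shuffle intrinsics; a minimal sketch under that assumption (not part of this sample):

// Sketch: tile-level sum via register shuffles instead of shared memory.
// Valid for thread_block_tile<N> with N <= 32; every lane returns the sum.
template <unsigned int N>
__device__ int sumReductionShfl(cooperative_groups::thread_block_tile<N> tile, int val)
{
    for (int i = tile.size() / 2; i > 0; i /= 2) {
        val += tile.shfl_down(val, i); // pull the partner's value from lane+i
    }
    return tile.shfl(val, 0); // broadcast lane 0's total to all lanes
}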
/**
@@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
 *
 * Creates cooperative groups and performs reductions
 */
__global__ void cgkernel()
{
    // threadBlockGroup includes all threads in the block
    thread_block threadBlockGroup     = this_thread_block();
    int          threadBlockGroupSize = threadBlockGroup.size();

    // workspace array in shared memory required for reduction
    extern __shared__ int workspace[];

    int input, output, expectedOutput;

    // input to reduction, for each thread, is its rank in the group
    input = threadBlockGroup.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;

    // perform reduction
    output = sumReduction(threadBlockGroup, workspace, input);

    // master thread in group prints out result
    if (threadBlockGroup.thread_rank() == 0) {
        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
               (int)threadBlockGroup.size() - 1,
               output,
               expectedOutput);

        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
    }

    threadBlockGroup.sync();

    // each tiledPartition16 group includes 16 threads
    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

    // This offset allows each group to have its own unique area in the workspace
    // array
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
 | 
			
		||||
    // This offset allows each group to have its own unique area in the workspace
 | 
			
		||||
    // array
 | 
			
		||||
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
 | 
			
		||||
 | 
			
		||||
  // input to reduction, for each thread, is its' rank in the group
 | 
			
		||||
  input = tiledPartition16.thread_rank();
 | 
			
		||||
    // input to reduction, for each thread, is its' rank in the group
 | 
			
		||||
    input = tiledPartition16.thread_rank();
 | 
			
		||||
 | 
			
		||||
  // expected output from analytical formula (n-1)(n)/2
 | 
			
		||||
  // (noting that indexing starts at 0 rather than 1)
 | 
			
		||||
  expectedOutput = 15 * 16 / 2;
 | 
			
		||||
    // expected output from analytical formula (n-1)(n)/2
 | 
			
		||||
    // (noting that indexing starts at 0 rather than 1)
 | 
			
		||||
    expectedOutput = 15 * 16 / 2;
 | 
			
		||||
 | 
			
		||||
  // Perform reduction
 | 
			
		||||
  output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);
 | 
			
		||||
    // Perform reduction
 | 
			
		||||
    output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);
 | 
			
		||||
 | 
			
		||||
  // each master thread prints out result
 | 
			
		||||
  if (tiledPartition16.thread_rank() == 0)
 | 
			
		||||
    printf(
 | 
			
		||||
        "   Sum of all ranks 0..15 in this tiledPartition16 group is %d "
 | 
			
		||||
        "(expected %d)\n",
 | 
			
		||||
        output, expectedOutput);
 | 
			
		||||
    // each master thread prints out result
 | 
			
		||||
    if (tiledPartition16.thread_rank() == 0)
 | 
			
		||||
        printf("   Sum of all ranks 0..15 in this tiledPartition16 group is %d "
 | 
			
		||||
               "(expected %d)\n",
 | 
			
		||||
               output,
 | 
			
		||||
               expectedOutput);
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
    return;
 | 
			
		||||
}
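
// Worked numbers for the (n-1)(n)/2 formula used above, assuming the
// 64-thread launch in main() below:
//   whole block (n = 64): expectedOutput = 63 * 64 / 2 = 2016 (ranks 0..63)
//   one 16-wide tile:     expectedOutput = 15 * 16 / 2 = 120  (ranks 0..15)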

/**
 * Host main routine
 */
int main() {
  // Error code to check return values for CUDA calls
  cudaError_t err;
int main()
{
    // Error code to check return values for CUDA calls
    cudaError_t err;

  // Launch the kernel
    // Launch the kernel

  int blocksPerGrid = 1;
  int threadsPerBlock = 64;
    int blocksPerGrid   = 1;
    int threadsPerBlock = 64;

  printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);
    printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);

  // we use the optional third argument to specify the size
  // of shared memory required in the kernel
  cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
  err = cudaDeviceSynchronize();
    // we use the optional third argument to specify the size
    // of shared memory required in the kernel
    cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
    err = cudaDeviceSynchronize();

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

  printf("\n...Done.\n\n");
    printf("\n...Done.\n\n");

  return 0;
    return 0;
}
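
// The launch above pairs the optional third <<<>>> argument with the kernel's
// "extern __shared__ int workspace[]". A minimal sketch of the same pairing
// (illustrative names and values only):
//
//   __global__ void k() { extern __shared__ int buf[]; /* blockDim.x ints */ }
//   int n = 64;
//   k<<<1, n, n * sizeof(int)>>>();  // 64 threads, 64 * 4 = 256 bytes shared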

@@ -26,27 +26,27 @@
 */

/*
* This sample demonstrates how to use texture fetches from layered 2D textures
* in CUDA C
*
* This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates),
* transform it to the expected output, and write it to a 3D output data array.
*/
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates),
 * transform it to the expected output, and write it to a 3D output data array.
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>

static const char *sSDKname = "simpleCubemapTexture";

@@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture";
//! Transform a cubemap face of a linear buffer using cubemap texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width,
                                cudaTextureObject_t tex) {
  // calculate this thread's data point
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  // 0.5f offset and division are necessary to access the original data points
  // in the texture (such that bilinear interpolation will not be activated).
  // For details, see also CUDA Programming Guide, Appendix D
    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D

  float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
  float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;
    float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
    float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;

  float cx, cy, cz;
    float cx, cy, cz;

  for (unsigned int face = 0; face < 6; face++) {
    // Layer 0 is positive X face
    if (face == 0) {
      cx = 1;
      cy = -v;
      cz = -u;
    }
    // Layer 1 is negative X face
    else if (face == 1) {
      cx = -1;
      cy = -v;
      cz = u;
    }
    // Layer 2 is positive Y face
    else if (face == 2) {
      cx = u;
      cy = 1;
      cz = v;
    }
    // Layer 3 is negative Y face
    else if (face == 3) {
      cx = u;
      cy = -1;
      cz = -v;
    }
    // Layer 4 is positive Z face
    else if (face == 4) {
      cx = u;
      cy = -v;
      cz = 1;
    }
    // Layer 5 is negative Z face
    else if (face == 5) {
      cx = -u;
      cy = -v;
      cz = -1;
    }
    for (unsigned int face = 0; face < 6; face++) {
        // Layer 0 is positive X face
        if (face == 0) {
            cx = 1;
            cy = -v;
            cz = -u;
        }
        // Layer 1 is negative X face
        else if (face == 1) {
            cx = -1;
            cy = -v;
            cz = u;
        }
        // Layer 2 is positive Y face
        else if (face == 2) {
            cx = u;
            cy = 1;
            cz = v;
        }
        // Layer 3 is negative Y face
        else if (face == 3) {
            cx = u;
            cy = -1;
            cz = -v;
        }
        // Layer 4 is positive Z face
        else if (face == 4) {
            cx = u;
            cy = -v;
            cz = 1;
        }
        // Layer 5 is negative Z face
        else if (face == 5) {
            cx = -u;
            cy = -v;
            cz = -1;
        }

    // read from texture, do expected transformation and write to global memory
    g_odata[face * width * width + y * width + x] =
        -texCubemap<float>(tex, cx, cy, cz);
  }
        // read from texture, do expected transformation and write to global memory
        g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
    }
}
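
// Face table implied by the branches above (cx, cy, cz per face index):
//   face 0: (+1, -v, -u)   face 1: (-1, -v, +u)
//   face 2: (+u, +1, +v)   face 3: (+u, -1, -v)
//   face 4: (+u, -v, +1)   face 5: (-u, -v, -1)
// texCubemap() selects the face from whichever coordinate has the largest
// magnitude, so pinning one component to +/-1 picks the face while (u, v)
// sweep across that face's texels.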

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);
int main(int argc, char **argv)
{
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

  bool bResult = true;
    bool bResult = true;

  // get number of SMs on this GPU
  cudaDeviceProp deviceProps;
    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
         deviceProps.multiProcessorCount);
  printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

  if (deviceProps.major < 2) {
    printf(
        "%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
        "will exit... \n",
        sSDKname);
    if (deviceProps.major < 2) {
        printf("%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
               "will exit... \n",
               sSDKname);

    exit(EXIT_WAIVED);
  }

  // generate input data for layered texture
  unsigned int width = 64, num_faces = 6, num_layers = 1;
  unsigned int cubemap_size = width * width * num_faces;
  unsigned int size = cubemap_size * num_layers * sizeof(float);
  float *h_data = (float *)malloc(size);

  for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
    h_data[i] = (float)i;
  }

  // this is the expected transformation of the input data (the expected output)
  float *h_data_ref = (float *)malloc(size);

  for (unsigned int layer = 0; layer < num_layers; layer++) {
    for (int i = 0; i < (int)(cubemap_size); i++) {
      h_data_ref[layer * cubemap_size + i] =
          -h_data[layer * cubemap_size + i] + layer;
        exit(EXIT_WAIVED);
    }
  }

  // allocate device memory for result
  float *d_data = NULL;
  checkCudaErrors(cudaMalloc((void **)&d_data, size));
    // generate input data for layered texture
    unsigned int width = 64, num_faces = 6, num_layers = 1;
    unsigned int cubemap_size = width * width * num_faces;
    unsigned int size         = cubemap_size * num_layers * sizeof(float);
    float       *h_data       = (float *)malloc(size);

  // allocate array and copy image data
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cu_3darray;
  //    checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
  //    make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
  checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
                                    make_cudaExtent(width, width, num_faces),
                                    cudaArrayCubemap));
  cudaMemcpy3DParms myparms = {0};
  myparms.srcPos = make_cudaPos(0, 0, 0);
  myparms.dstPos = make_cudaPos(0, 0, 0);
  myparms.srcPtr =
      make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
  myparms.dstArray = cu_3darray;
  myparms.extent = make_cudaExtent(width, width, num_faces);
  myparms.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&myparms));
    for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
        h_data[i] = (float)i;
    }

  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cu_3darray;
    for (unsigned int layer = 0; layer < num_layers; layer++) {
        for (int i = 0; i < (int)(cubemap_size); i++) {
            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
        }
    }

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModeLinear;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;
    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    //    checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
    //    make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos            = make_cudaPos(0, 0, 0);
    myparms.dstPos            = make_cudaPos(0, 0, 0);
    myparms.srcPtr            = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
    myparms.dstArray          = cu_3darray;
    myparms.extent            = make_cudaExtent(width, width, num_faces);
    myparms.kind              = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

  printf(
      "Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
      "block has 8 x 8 threads\n",
      width, num_layers, dimGrid.x, dimGrid.y);
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                         tex);  // warmup (for better timing)
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.addressMode[2]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

  // check if kernel execution generated an error
  getLastCudaError("warmup Kernel execution failed");
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  checkCudaErrors(cudaDeviceSynchronize());
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
           "block has 8 x 8 threads\n",
           width,
           num_layers,
           dimGrid.x,
           dimGrid.y);

  // execute the kernel
  transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);
    transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                           tex); // warmup (for better timing)

  // check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
  printf("%.2f Mtexlookups/sec\n",
         (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
  sdkDeleteTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());

  // allocate mem for the result on host side
  float *h_odata = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

  // write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
    sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f,
                        false);
  } else {
    printf("Comparing kernel output to expected data\n");
    // execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
    bResult =
        compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
  }
        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
    }

  // cleanup memory
  free(h_data);
  free(h_data_ref);
  free(h_odata);
    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(d_data));
  checkCudaErrors(cudaFreeArray(cu_3darray));
    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

  exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
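
// Worked numbers for the 0.5f texel-center offset used by transformKernel,
// assuming width = 64:
//   x = 0:  u = ((0 + 0.5) / 64) * 2 - 1  = -0.984375 (center of texel 0)
//   x = 63: u = ((63 + 0.5) / 64) * 2 - 1 = +0.984375 (center of texel 63)
// Sampling exactly at texel centers keeps cudaFilterModeLinear from blending
// neighboring texels, so the kernel reads back the unfiltered values.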

@@ -33,12 +33,12 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <cstring>
#include <iostream>

// includes, project
#include <helper_cuda.h>
@@ -62,165 +62,165 @@ float *d_B;
float *d_C;

// Functions
int CleanupNoFailure(CUcontext &cuContext);
int  CleanupNoFailure(CUcontext &cuContext);
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, ostringstream &);

static void check(CUresult result, char const *const func,
                  const char *const file, int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
            static_cast<unsigned int>(result), func);
    exit(EXIT_FAILURE);
  }
static void check(CUresult result, char const *const func, const char *const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
        exit(EXIT_FAILURE);
    }
}

#define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

// Host code
int main(int argc, char **argv) {
  printf("simpleDrvRuntime..\n");
  int N = 50000, devID = 0;
  size_t size = N * sizeof(float);
  CUdevice cuDevice;
  CUfunction vecAdd_kernel;
  CUmodule cuModule = 0;
  CUcontext cuContext;
int main(int argc, char **argv)
{
    printf("simpleDrvRuntime..\n");
    int        N = 50000, devID = 0;
    size_t     size = N * sizeof(float);
    CUdevice   cuDevice;
    CUfunction vecAdd_kernel;
    CUmodule   cuModule = 0;
    CUcontext  cuContext;

  // Initialize
  checkCudaDrvErrors(cuInit(0));
    // Initialize
    checkCudaDrvErrors(cuInit(0));

  cuDevice = findCudaDevice(argc, (const char **)argv);
  // Create context
  checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice));
    cuDevice = findCudaDevice(argc, (const char **)argv);
    // Create context
    checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // first search for the module path before we load the results
  string module_path;
  ostringstream fatbin;
    // first search for the module path before we load the results
    string        module_path;
    ostringstream fatbin;

  if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }

  // Create module from binary file (FATBIN)
  checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  // Get function handle from module
  checkCudaDrvErrors(
      cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

  // Allocate input vectors h_A and h_B in host memory
  checkCudaErrors(cudaMallocHost(&h_A, size));
  checkCudaErrors(cudaMallocHost(&h_B, size));
  checkCudaErrors(cudaMallocHost(&h_C, size));

  // Initialize input vectors
  RandomInit(h_A, N);
  RandomInit(h_B, N);

  // Allocate vectors in device memory
  checkCudaErrors(cudaMalloc((void **)(&d_A), size));
  checkCudaErrors(cudaMalloc((void **)(&d_B), size));
  checkCudaErrors(cudaMalloc((void **)(&d_C), size));

  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  // Copy vectors from host memory to device memory
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

  int threadsPerBlock = 256;
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

  void *args[] = {&d_A, &d_B, &d_C, &N};

  // Launch the CUDA kernel
  checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
                                    threadsPerBlock, 1, 1, 0, stream, args,
                                    NULL));

  // Copy result from device memory to host memory
  // h_C contains the result in host memory
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  // Verify result
  int i;

  for (i = 0; i < N; ++i) {
    float sum = h_A[i] + h_B[i];

    if (fabs(h_C[i] - sum) > 1e-7f) {
      break;
    if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }
  }

  checkCudaDrvErrors(cuModuleUnload(cuModule));
  CleanupNoFailure(cuContext);
  printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

  exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
    // Create module from binary file (FATBIN)
    checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    checkCudaErrors(cudaMallocHost(&h_A, size));
    checkCudaErrors(cudaMallocHost(&h_B, size));
    checkCudaErrors(cudaMallocHost(&h_C, size));

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    checkCudaErrors(cudaMalloc((void **)(&d_A), size));
    checkCudaErrors(cudaMalloc((void **)(&d_B), size));
    checkCudaErrors(cudaMalloc((void **)(&d_C), size));

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // Copy vectors from host memory to device memory
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = {&d_A, &d_B, &d_C, &N};

    // Launch the CUDA kernel
    checkCudaDrvErrors(
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    checkCudaDrvErrors(cuModuleUnload(cuModule));
    CleanupNoFailure(cuContext);
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}
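
// Two details of the launch above, spelled out:
//
// 1) Grid sizing uses ceiling division; with the sample's values,
//    blocksPerGrid = (50000 + 256 - 1) / 256 = 196 blocks (196 * 256 = 50176
//    threads, so the kernel's i < N guard masks off the final 176 threads).
//
// 2) cuLaunchKernel's kernelParams argument is an array of *addresses* of the
//    kernel arguments, not of their values; the driver copies each argument
//    through the given pointer. A minimal sketch of the same convention
//    (illustrative names):
//
//      float *devA, *devB, *devC;  // device pointers
//      int    n = 50000;
//      void *params[] = {&devA, &devB, &devC, &n};
//      cuLaunchKernel(vecAdd_kernel, 196, 1, 1, 256, 1, 1,
//                     0 /* shared mem */, stream, params, NULL);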

int CleanupNoFailure(CUcontext &cuContext) {
  // Free device memory
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
int CleanupNoFailure(CUcontext &cuContext)
{
    // Free device memory
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

  // Free host memory
  if (h_A) {
    checkCudaErrors(cudaFreeHost(h_A));
  }
    // Free host memory
    if (h_A) {
        checkCudaErrors(cudaFreeHost(h_A));
    }

  if (h_B) {
    checkCudaErrors(cudaFreeHost(h_B));
  }
    if (h_B) {
        checkCudaErrors(cudaFreeHost(h_B));
    }

  if (h_C) {
    checkCudaErrors(cudaFreeHost(h_C));
  }
    if (h_C) {
        checkCudaErrors(cudaFreeHost(h_C));
    }

  checkCudaDrvErrors(cuCtxDestroy(cuContext));
    checkCudaDrvErrors(cuCtxDestroy(cuContext));

  return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}
// Allocates an array with random float entries.
void RandomInit(float *data, int n) {
  for (int i = 0; i < n; ++i) {
    data[i] = rand() / (float)RAND_MAX;
  }
}

bool inline findModulePath(const char *module_file, string &module_path,
                           char **argv, ostringstream &ostrm) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath could not find file: <%s> \n", module_file);
    return false;
  } else {
    printf("> findModulePath found file at <%s>\n", module_path.c_str());
    if (module_path.rfind("fatbin") != string::npos) {
      ifstream fileIn(module_path.c_str(), ios::binary);
      ostrm << fileIn.rdbuf();
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);

    if (actual_path) {
        module_path = actual_path;
    }
    else {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty()) {
        printf("> findModulePath could not find file: <%s> \n", module_file);
        return false;
    }
    else {
        printf("> findModulePath found file at <%s>\n", module_path.c_str());
        if (module_path.rfind("fatbin") != string::npos) {
            ifstream fileIn(module_path.c_str(), ios::binary);
            ostrm << fileIn.rdbuf();
        }
        return true;
    }
    return true;
  }
}

@@ -34,9 +34,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
                                         float *C, int N) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

  if (i < N) C[i] = A[i] + B[i];
    if (i < N)
        C[i] = A[i] + B[i];
}
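
// The extern "C" above matters for the driver-API lookup shown earlier:
// cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel") searches the
// module for the literal symbol name, and C linkage keeps the compiler from
// C++-mangling it. Without extern "C" the lookup would need the mangled name,
// e.g. something like "_Z13VecAdd_kernelPKfS0_Pfi" (illustrative mangling).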

@@ -44,188 +44,188 @@ const char *sSDKsample = "hyperQ";

// This subroutine does no real work but runs for at least the specified number
// of clock ticks.
__device__ void clock_block(clock_t *d_o, clock_t clock_count) {
  unsigned int start_clock = (unsigned int)clock();
__device__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    unsigned int start_clock = (unsigned int)clock();

  clock_t clock_offset = 0;
    clock_t clock_offset = 0;

  while (clock_offset < clock_count) {
    unsigned int end_clock = (unsigned int)clock();
    while (clock_offset < clock_count) {
        unsigned int end_clock = (unsigned int)clock();

    // The code below should work like
    // this (thanks to modular arithmetic):
    //
    // clock_offset = (clock_t) (end_clock > start_clock ?
    //                           end_clock - start_clock :
    //                           end_clock + (0xffffffffu - start_clock));
    //
    // Indeed, let m = 2^32 then
    // end - start = end + m - start (mod m).
        // The code below should work like
        // this (thanks to modular arithmetic):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

    clock_offset = (clock_t)(end_clock - start_clock);
  }
        clock_offset = (clock_t)(end_clock - start_clock);
    }

  d_o[0] = clock_offset;
    d_o[0] = clock_offset;
}
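
// A worked wraparound case for the unsigned subtraction above (assuming the
// 32-bit clock register wraps once between the two reads):
//   start_clock = 0xFFFFFFF0, end_clock = 0x00000010
//   end_clock - start_clock = 0x10 - 0xFFFFFFF0 (mod 2^32) = 0x20 = 32 ticks
// which is exactly the elapsed count, so no explicit wrap test is needed.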

// We create two identical kernels calling clock_block(), we create two so that
// we can identify dependencies in the profile timeline ("kernel_B" is always
// dependent on "kernel_A" in the same stream).
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) {
  clock_block(d_o, clock_count);
}
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) {
  clock_block(d_o, clock_count);
}
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }

// Single-warp reduction kernel (note: kept simple rather than optimized)
__global__ void sum(clock_t *d_clocks, int N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ clock_t s_clocks[32];
__global__ void sum(clock_t *d_clocks, int N)
{
    // Handle to thread block group
    cg::thread_block   cta = cg::this_thread_block();
    __shared__ clock_t s_clocks[32];

  clock_t my_sum = 0;
    clock_t my_sum = 0;

  for (int i = threadIdx.x; i < N; i += blockDim.x) {
    my_sum += d_clocks[i];
  }

  s_clocks[threadIdx.x] = my_sum;
  cg::sync(cta);

  for (int i = warpSize / 2; i > 0; i /= 2) {
    if (threadIdx.x < i) {
      s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
    for (int i = threadIdx.x; i < N; i += blockDim.x) {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    cg::sync(cta);
  }

  if (threadIdx.x == 0) {
    d_clocks[0] = s_clocks[0];
  }
}
    for (int i = warpSize / 2; i > 0; i /= 2) {
        if (threadIdx.x < i) {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

int main(int argc, char **argv) {
  int nstreams = 32;       // One stream for each pair of kernels
  float kernel_time = 10;  // Time each kernel should run in ms
  float elapsed_time;
  int cuda_device = 0;

  printf("starting %s...\n", sSDKsample);

  // Get number of streams (if overridden on the command line)
  if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) {
    nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
  }

  // Use command-line specified CUDA device, otherwise use device with
  // highest Gflops/s
  cuda_device = findCudaDevice(argc, (const char **)argv);

  // Get device properties
  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDevice(&cuda_device));
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

  // HyperQ is available in devices of Compute Capability 3.5 and higher
  if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
    if (deviceProp.concurrentKernels == 0) {
      printf(
          "> GPU does not support concurrent kernel execution (SM 3.5 or "
          "higher required)\n");
      printf("  CUDA kernel runs will be serialized\n");
    } else {
      printf("> GPU does not support HyperQ\n");
      printf("  CUDA kernel runs will have limited concurrency\n");
        cg::sync(cta);
    }
  }

  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

  // Allocate host memory for the output (reduced to a single value)
  clock_t *a = 0;
  checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));

  // Allocate device memory for the output (one value for each kernel)
  clock_t *d_a = 0;
  checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));

  // Allocate and initialize an array of stream handles
  cudaStream_t *streams =
      (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

  for (int i = 0; i < nstreams; i++) {
    checkCudaErrors(cudaStreamCreate(&(streams[i])));
  }

  // Create CUDA event handles
  cudaEvent_t start_event, stop_event;
  checkCudaErrors(cudaEventCreate(&start_event));
  checkCudaErrors(cudaEventCreate(&stop_event));

  // Target time per kernel is kernel_time ms, clockRate is in KHz
  // Target number of clocks = target time * clock frequency
#if defined(__arm__) || defined(__aarch64__)
  // the kernel takes more time than the channel reset time on arm archs, so to
  // prevent hangs reduce time_clocks.
  clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
  clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
  clock_t total_clocks = 0;

  // Start the clock
  checkCudaErrors(cudaEventRecord(start_event, 0));

  // Queue pairs of {kernel_A, kernel_B} in separate streams
  for (int i = 0; i < nstreams; ++i) {
    kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);
    total_clocks += time_clocks;
    kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks);
    total_clocks += time_clocks;
  }

  // Stop the clock in stream 0 (i.e. all previous kernels will be complete)
  checkCudaErrors(cudaEventRecord(stop_event, 0));

  // At this point the CPU has dispatched all work for the GPU and can
  // continue processing other tasks in parallel. In this sample we just want
  // to wait until all work is done so we use a blocking cudaMemcpy below.

  // Run the sum kernel and copy the result back to host
  sum<<<1, 32>>>(d_a, 2 * nstreams);
  checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));

  // stop_event will have been recorded but including the synchronize here to
  // prevent copy/paste errors!
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

  printf(
      "Expected time for serial execution of %d sets of kernels is between "
      "approx. %.3fs and %.3fs\n",
      nstreams, (nstreams + 1) * kernel_time / 1000.0f,
      2 * nstreams * kernel_time / 1000.0f);
  printf(
      "Expected time for fully concurrent execution of %d sets of kernels is "
      "approx. %.3fs\n",
      nstreams, 2 * kernel_time / 1000.0f);
  printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

  bool bTestResult = (a[0] >= total_clocks);

  // Release resources
  for (int i = 0; i < nstreams; i++) {
    cudaStreamDestroy(streams[i]);
  }

  free(streams);
  cudaEventDestroy(start_event);
  cudaEventDestroy(stop_event);
  cudaFreeHost(a);
  cudaFree(d_a);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
    if (threadIdx.x == 0) {
        d_clocks[0] = s_clocks[0];
    }
}
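
// Shape of the reduction above, assuming the sum<<<1, 32>>> launch used by
// this sample (one warp, warpSize == 32):
//   pass 1: i = 16 -> s_clocks[0..15] += s_clocks[16..31]
//   pass 2: i = 8  -> s_clocks[0..7]  += s_clocks[8..15]
//   ...
//   pass 5: i = 1  -> s_clocks[0]     += s_clocks[1]
// After five passes s_clocks[0] holds the total, which thread 0 writes out.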
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    int   nstreams    = 32; // One stream for each pair of kernels
 | 
			
		||||
    float kernel_time = 10; // Time each kernel should run in ms
 | 
			
		||||
    float elapsed_time;
 | 
			
		||||
    int   cuda_device = 0;
 | 
			
		||||
 | 
			
		||||
    printf("starting %s...\n", sSDKsample);
 | 
			
		||||
 | 
			
		||||
    // Get number of streams (if overridden on the command line)
 | 
			
		||||
    if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) {
 | 
			
		||||
        nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Use command-line specified CUDA device, otherwise use device with
 | 
			
		||||
    // highest Gflops/s
 | 
			
		||||
    cuda_device = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
    // Get device properties
 | 
			
		||||
    cudaDeviceProp deviceProp;
 | 
			
		||||
    checkCudaErrors(cudaGetDevice(&cuda_device));
 | 
			
		||||
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
 | 
			
		||||
 | 
			
		||||
    // HyperQ is available in devices of Compute Capability 3.5 and higher
 | 
			
		||||
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
 | 
			
		||||
        if (deviceProp.concurrentKernels == 0) {
 | 
			
		||||
            printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
 | 
			
		||||
                   "higher required)\n");
 | 
			
		||||
            printf("  CUDA kernel runs will be serialized\n");
 | 
			
		||||
        }
 | 
			
		||||
        else {
 | 
			
		||||
            printf("> GPU does not support HyperQ\n");
 | 
			
		||||
            printf("  CUDA kernel runs will have limited concurrency\n");
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
 | 
			
		||||
           deviceProp.major,
 | 
			
		||||
           deviceProp.minor,
 | 
			
		||||
           deviceProp.multiProcessorCount);
 | 
			
		||||
 | 
			
		||||
    // Allocate host memory for the output (reduced to a single value)
 | 
			
		||||
    clock_t *a = 0;
 | 
			
		||||
    checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));
 | 
			
		||||
 | 
			
		||||
    // Allocate device memory for the output (one value for each kernel)
 | 
			
		||||
    clock_t *d_a = 0;
 | 
			
		||||
    checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));
 | 
			
		||||
 | 
			
		||||
    // Allocate and initialize an array of stream handles
 | 
			
		||||
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < nstreams; i++) {
 | 
			
		||||
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Create CUDA event handles
 | 
			
		||||
    cudaEvent_t start_event, stop_event;
 | 
			
		||||
    checkCudaErrors(cudaEventCreate(&start_event));
 | 
			
		||||
    checkCudaErrors(cudaEventCreate(&stop_event));
 | 
			
		||||
 | 
			
		||||
    // Target time per kernel is kernel_time ms, clockRate is in KHz
 | 
			
		||||
    // Target number of clocks = target time * clock frequency
 | 
			
		||||
#if defined(__arm__) || defined(__aarch64__)
 | 
			
		||||
    // the kernel takes more time than the channel reset time on arm archs, so to
 | 
			
		||||
    // prevent hangs reduce time_clocks.
 | 
			
		||||
    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
 | 
			
		||||
#else
 | 
			
		||||
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
 | 
			
		||||
#endif
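    // Worked example (not from the sample itself): clockRate is reported in kHz,
    // so ms * kHz already yields clock ticks. With kernel_time = 10 ms = 1e-2 s
    // on a 1 GHz GPU (clockRate = 1e6 kHz = 1e9 Hz), the target is
    // 1e-2 * 1e9 = 1e7 ticks, which is exactly kernel_time * clockRate = 10 * 1e6.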
    clock_t total_clocks = 0;

    // Start the clock
    checkCudaErrors(cudaEventRecord(start_event, 0));

    // Queue pairs of {kernel_A, kernel_B} in separate streams
    for (int i = 0; i < nstreams; ++i) {
        kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);
        total_clocks += time_clocks;
        kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks);
        total_clocks += time_clocks;
    }

    // Stop the clock in stream 0 (i.e. all previous kernels will be complete)
    checkCudaErrors(cudaEventRecord(stop_event, 0));

    // At this point the CPU has dispatched all work for the GPU and can
    // continue processing other tasks in parallel. In this sample we just want
    // to wait until all work is done, so we use a blocking cudaMemcpy below.

    // Run the sum kernel and copy the result back to the host
    sum<<<1, 32>>>(d_a, 2 * nstreams);
    checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));

    // stop_event will already have been recorded, but we include the synchronize
    // here to prevent copy/paste errors!
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d sets of kernels is between "
           "approx. %.3fs and %.3fs\n",
           nstreams,
           (nstreams + 1) * kernel_time / 1000.0f,
           2 * nstreams * kernel_time / 1000.0f);
    printf("Expected time for fully concurrent execution of %d sets of kernels is "
           "approx. %.3fs\n",
           nstreams,
           2 * kernel_time / 1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

    bool bTestResult = (a[0] >= total_clocks);

    // Release resources
    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }

    free(streams);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
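// A minimal sketch (hypothetical, not the sample's actual definition) of the
// spin-wait kernels that main() launches above: each one busy-waits for roughly
// clock_count ticks and writes the ticks it actually spent into its output
// slot, which the sum kernel later reduces so a[0] can be checked against
// total_clocks.
__global__ void kernel_sketch(clock_t *d_o, clock_t clock_count)
{
    clock_t start_clock  = clock();
    clock_t clock_offset = 0;
    while (clock_offset < clock_count) {
        clock_offset = clock() - start_clock; // spin until the budget is used up
    }
    d_o[0] = clock_offset;
}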
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d

Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)

@@ -32,6 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#include "helper_cuda.h"
#include "helper_multiprocess.h"
static const char shmName[] = "simpleIPCshm";
@@ -39,7 +40,7 @@ static const char shmName[] = "simpleIPCshm";
// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
// in the same way.
#define MAX_DEVICES (32)
#define DATA_SIZE   (64ULL << 20ULL) // 64MB

#if defined(__linux__)
#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
@@ -49,281 +50,280 @@ static const char shmName[] = "simpleIPCshm";
#error Unsupported system
#endif

typedef struct shmStruct_st
{
    size_t               nprocesses;
    int                  barrier;
    int                  sense;
    int                  devices[MAX_DEVICES];
    cudaIpcMemHandle_t   memHandle[MAX_DEVICES];
    cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
} shmStruct;

__global__ void simpleKernel(char *ptr, int sz, char val)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
        ptr[idx] = val;
    }
}

static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
{
    int count;

    // Check-in
    count = cpu_atomic_add32(barrier, 1);
    if (count == n) // Last one in
        *sense = 1;
    while (!*sense)
        ;

    // Check-out
    count = cpu_atomic_add32(barrier, -1);
    if (count == 0) // Last one out
        *sense = 0;
    while (*sense)
        ;
}
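// Note on barrierWait above: this is a two-phase sense-reversing barrier. The
// check-in phase spins until the last arriving process flips *sense to 1; the
// check-out phase spins until the last departing process flips it back to 0,
// so the barrier can be reused immediately without a fast process from the next
// round racing past a stale sense value.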

static void childProcess(int id)
{
    volatile shmStruct      *shm = NULL;
    cudaStream_t             stream;
    sharedMemoryInfo         info;
    size_t                   procCount, i;
    int                      blocks  = 0;
    int                      threads = 128;
    cudaDeviceProp           prop;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<char>        verification_buffer(DATA_SIZE);

    if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm       = (volatile shmStruct *)info.addr;
    procCount = shm->nprocesses;

    printf("Process %d: Starting on device %d...\n", id, shm->devices[id]);

    checkCudaErrors(cudaSetDevice(shm->devices[id]));
    checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
    blocks *= prop.multiProcessorCount;

    // Open and track all the allocations and events created in the master
    // process for use later
    for (i = 0; i < procCount; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        // Notice, we don't need to explicitly enable peer access for
        // allocations on other devices.
        checkCudaErrors(
            cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
        checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // At each iteration of the loop, each sibling process will push work on
    // their respective devices accessing the next peer mapped buffer allocated
    // by the master process (these can come from other sibling processes as
    // well). To coordinate each process' access, we force the stream to wait for
    // the work already accessing this buffer asynchronously through IPC events,
    // allowing the CPU processes to continue to queue more work.
    for (i = 0; i < procCount; i++) {
        size_t bufferId = (i + id) % procCount;
        // Wait for the buffer to be accessed to be ready
        checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
        // Push a simple kernel on it
        simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
        checkCudaErrors(cudaGetLastError());
        // Signal that this buffer is ready for the next consumer
        checkCudaErrors(cudaEventRecord(events[bufferId], stream));
        // Wait for all my sibling processes to push this stage of their work
        // before proceeding to the next. This prevents siblings from racing
        // ahead and clobbering the recorded event or waiting on the wrong
        // recorded event.
        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
        if (id == 0) {
            printf("Step %lld done\n", (unsigned long long)i);
        }
    }

    // Now wait for my buffer to be ready so I can copy it locally and verify it
    checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
    checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
    // And wait for all the queued up work to complete
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Process %d: verifying...\n", id);

    // The contents should have the id of the sibling just after me
    char compareId = (char)((id + 1) % procCount);
    for (unsigned long long j = 0; j < DATA_SIZE; j++) {
        if (verification_buffer[j] != compareId) {
            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
                   id,
                   j,
                   (int)verification_buffer[j],
                   (int)compareId);
        }
    }

    // Clean up!
    for (i = 0; i < procCount; i++) {
        checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
    }

    checkCudaErrors(cudaStreamDestroy(stream));

    printf("Process %d complete!\n", id);
}
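// Worked example of the (i + id) % procCount ring above: with procCount = 3,
// process 0 writes buffers 0, 1, 2 across the three steps, process 1 writes
// 1, 2, 0, and process 2 writes 2, 0, 1. On the last step, buffer b is written
// by process (b + 1) % procCount, which is exactly the compareId each child
// checks for during verification.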

static void parentProcess(char *app)
{
    sharedMemoryInfo         info;
    int                      devCount, i;
    volatile shmStruct      *shm = NULL;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<Process>     processes;

    checkCudaErrors(cudaGetDeviceCount(&devCount));

    if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm = (volatile shmStruct *)info.addr;
    memset((void *)shm, 0, sizeof(*shm));

    // Pick all the devices that can access each other's memory for this test
    // Keep in mind that CUDA has minimal support for fork() without a
    // corresponding exec() in the child process, but in this case our
    // spawnProcess will always exec, so no need to worry.
    for (i = 0; i < devCount; i++) {
        bool           allPeers = true;
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, i));

        // CUDA IPC is only supported on devices with unified addressing
        if (!prop.unifiedAddressing) {
            printf("Device %d does not support unified addressing, skipping...\n", i);
            continue;
        }
        // This sample requires two processes accessing each device, so we need
        // to ensure exclusive or prohibited mode is not set
        if (prop.computeMode != cudaComputeModeDefault) {
            printf("Device %d is in an unsupported compute mode for this sample\n", i);
            continue;
        }

        for (int j = 0; j < shm->nprocesses; j++) {
            int canAccessPeerIJ, canAccessPeerJI;
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
            if (!canAccessPeerIJ || !canAccessPeerJI) {
                allPeers = false;
                break;
            }
        }
        if (allPeers) {
            // Enable peers here.  This isn't necessary for IPC, but it will
            // setup the peers for the device.  For systems that only allow 8
            // peers per GPU at a time, this acts to remove devices from CanAccessPeer
            for (int j = 0; j < shm->nprocesses; j++) {
                checkCudaErrors(cudaSetDevice(i));
                checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
                checkCudaErrors(cudaSetDevice(shm->devices[j]));
                checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
            }
            shm->devices[shm->nprocesses++] = i;
            if (shm->nprocesses >= MAX_DEVICES)
                break;
        }
        else {
            printf("Device %d is not peer capable with some other selected peers, "
                   "skipping\n",
                   i);
        }
    }

    if (shm->nprocesses == 0) {
        printf("No CUDA devices support IPC\n");
        exit(EXIT_WAIVED);
    }

    // Now allocate memory and an event for each process and fill the shared
    // memory buffer with the IPC handles to communicate
    for (i = 0; i < shm->nprocesses; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // Launch the child processes!
    for (i = 0; i < shm->nprocesses; i++) {
        char        devIdx[12]; // Increased size to ensure enough space for formatted integer
        char *const args[] = {app, devIdx, NULL};
        Process     process;

        snprintf(devIdx, sizeof(devIdx), "%d", i);

        if (spawnProcess(&process, app, args)) {
            printf("Failed to create process\n");
            exit(EXIT_FAILURE);
        }

        processes.push_back(process);
    }

    // And wait for them to finish
    for (i = 0; i < processes.size(); i++) {
        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
            printf("Process %d failed!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Clean up!
    for (i = 0; i < shm->nprocesses; i++) {
        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaEventSynchronize(events[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
        checkCudaErrors(cudaFree(ptrs[i]));
    }

    sharedMemoryClose(&info);
}

int main(int argc, char **argv)
{
#if defined(__arm__) || defined(__aarch64__)
    printf("Not supported on ARM\n");
    return EXIT_WAIVED;
#else
    if (argc == 1) {
        parentProcess(argv[0]);
    }
    else {
        childProcess(atoi(argv[1]));
    }
    return EXIT_SUCCESS;
#endif
}
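// Invocation pattern implied by main() above: running the binary with no
// arguments starts parentProcess(), which re-executes the same binary once per
// selected device with the child's index as argv[1] (args = {app, devIdx, NULL}),
// e.g. "./simpleIPC" spawns "./simpleIPC 0", "./simpleIPC 1", and so on.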

@@ -26,27 +26,27 @@
 */

/*
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates),
 * transform it to the expected output, and write it to a 3D output data array.
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

static const char *sSDKname = "simpleLayeredTexture";

@@ -54,163 +54,156 @@ static const char *sSDKname = "simpleLayeredTexture";
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The 0.5f offset and division are necessary to access the original data
    // points in the texture (such that bilinear interpolation will not be
    // activated). For details, see also CUDA Programming Guide, Appendix D.
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;

    // read from texture, do the expected transformation and write to global memory
    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
}
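// Worked example of the 0.5f offset above: with width = 512 and x = 0,
// u = 0.5f / 512. In normalized coordinates a 512-texel row spans [0, 1), so
// texel k is centered at (k + 0.5) / 512; sampling exactly at texel centers
// makes cudaFilterModeLinear return the stored value instead of a blend of
// neighboring texels.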

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    // generate input data for layered texture
    unsigned int width = 512, height = 512, num_layers = 5;
    unsigned int size   = width * height * num_layers * sizeof(float);
    float       *h_data = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data[layer * width * height + i] = (float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
        }

    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos            = make_cudaPos(0, 0, 0);
    myparms.dstPos            = make_cudaPos(0, 0, 0);
    myparms.srcPtr            = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
    myparms.dstArray          = cu_3darray;
    myparms.extent            = make_cudaExtent(width, height, num_layers);
    myparms.kind              = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
           "8 x 8 threads\n",
           width,
           height,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
                                           tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);
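    // Worked example of the rate above: 512 * 512 * 5 = 1,310,720 lookups, so a
    // 1.0 msec run reports 1310720 / 0.001 / 1e6, i.e. about 1310.72 Mtexlookups/sec.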

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

@@ -26,15 +26,15 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 *  Generate some random numbers on one node.
 *  Dispatch them to all nodes.
 *  Compute their square root on each node's GPU.
 *  Compute the average of the results using MPI.
 *
 *  simpleMPI.cpp: main program, compiled with mpicxx on Linux/Mac platforms;
 *                 on Windows, please download the Microsoft HPC Pack SDK 2008
 */

// MPI include
#include <mpi.h>
@@ -42,87 +42,88 @@
// System includes
#include <iostream>

using std::cerr;
using std::cout;
using std::endl;

// User include
#include "simpleMPI.h"

// Error handling macros
#define MPI_CHECK(call)                              \
    if ((call) != MPI_SUCCESS) {                     \
        cerr << "MPI error calling \"" #call "\"\n"; \
        my_abort(-1);                                \
    }

// Host code
// No CUDA here, only MPI
int main(int argc, char *argv[])
{
    // Dimensions of the dataset
    int blockSize       = 256;
    int gridSize        = 10000;
    int dataSizePerNode = gridSize * blockSize;
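    // Worked example: dataSizePerNode = 10000 * 256 = 2,560,000 floats, i.e.
    // about 10.24 MB per node; across commSize nodes the root generates
    // commSize * 2,560,000 values before scattering them below.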

    // Initialize MPI state
    MPI_CHECK(MPI_Init(&argc, &argv));

    // Get our MPI node number and node count
    int commSize, commRank;
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

    // Generate some random numbers on the root node (node 0)
    int    dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot      = NULL;

    // Are we the root node?
    if (commRank == 0) {
        cout << "Running on " << commSize << " nodes" << endl;
        dataRoot = new float[dataSizeTotal];
        initData(dataRoot, dataSizeTotal);
    }

    // Allocate a buffer on each node
    float *dataNode = new float[dataSizePerNode];

    // Dispatch a portion of the input data to each node
    MPI_CHECK(
        MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        // No need for root data any more
        delete[] dataRoot;
    }

    // On each node, run computation on GPU
    computeGPU(dataNode, blockSize, gridSize);

    // Reduction to the root node, computing the sum of output elements
    float sumNode = sum(dataNode, dataSizePerNode);
    float sumRoot;

    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        float average = sumRoot / dataSizeTotal;
        cout << "Average of square roots is: " << average << endl;
    }

    // Cleanup
    delete[] dataNode;
    MPI_CHECK(MPI_Finalize());

    if (commRank == 0) {
        cout << "PASSED\n";
    }

    return 0;
}

// Shut down MPI cleanly if something goes wrong
void my_abort(int err)
{
    cout << "Test FAILED\n";
    MPI_Abort(MPI_COMM_WORLD, err);
}
			
		||||
 | 
			
		||||
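The host program above is the whole sample in miniature: scatter from rank 0, compute per node, reduce back to rank 0. A stripped-down, host-only sketch of that same shape, with a plain loop standing in for computeGPU() and sum() (hypothetical perNode size, buildable with an MPI C++ compiler such as mpicxx):

    #include <mpi.h>
    #include <iostream>

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);
        int size, rank;
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        const int perNode = 4; // hypothetical per-rank element count
        float    *root    = NULL;
        if (rank == 0) {
            root = new float[size * perNode];
            for (int i = 0; i < size * perNode; i++)
                root[i] = 1.0f; // stand-in for initData()
        }

        float local[perNode];
        MPI_Scatter(root, perNode, MPI_FLOAT, local, perNode, MPI_FLOAT, 0, MPI_COMM_WORLD);

        float localSum = 0.f; // stand-in for computeGPU() + sum()
        for (int i = 0; i < perNode; i++)
            localSum += local[i];

        float totalSum = 0.f;
        MPI_Reduce(&localSum, &totalSum, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
        if (rank == 0) {
            std::cout << "average = " << totalSum / (size * perNode) << std::endl;
            delete[] root;
        }
        MPI_Finalize();
        return 0;
    }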
@@ -26,14 +26,14 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
-*
-*  Generate some random numbers on one node.
-*  Dispatch them to all nodes.
-*  Compute their square root on each node's GPU.
-*  Compute the average of the results using MPI.
-*
-*  simpleMPI.cu: GPU part, compiled with nvcc
-*/
+ *
+ *  Generate some random numbers on one node.
+ *  Dispatch them to all nodes.
+ *  Compute their square root on each node's GPU.
+ *  Compute the average of the results using MPI.
+ *
+ *  simpleMPI.cu: GPU part, compiled with nvcc
+ */

#include <iostream>
using std::cerr;
@@ -42,61 +42,63 @@ using std::endl;
#include "simpleMPI.h"

// Error handling macro
-#define CUDA_CHECK(call)                                                 \
-  if ((call) != cudaSuccess) {                                           \
-    cudaError_t err = cudaGetLastError();                                \
-    cerr << "CUDA error calling \"" #call "\", code is " << err << endl; \
-    my_abort(err);                                                       \
-  }
+#define CUDA_CHECK(call)                                                     \
+    if ((call) != cudaSuccess) {                                             \
+        cudaError_t err = cudaGetLastError();                                \
+        cerr << "CUDA error calling \"" #call "\", code is " << err << endl; \
+        my_abort(err);                                                       \
+    }

// Device code
// Very simple GPU Kernel that computes square roots of input numbers
-__global__ void simpleMPIKernel(float *input, float *output) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  output[tid] = sqrt(input[tid]);
+__global__ void simpleMPIKernel(float *input, float *output)
+{
+    int tid     = blockIdx.x * blockDim.x + threadIdx.x;
+    output[tid] = sqrt(input[tid]);
}
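Note that simpleMPIKernel carries no bounds check: computeGPU() below launches exactly gridSize blocks of blockSize threads over dataSize = gridSize * blockSize elements, so every thread maps to a valid element. When the element count does not divide evenly, the usual guarded form looks like this (a sketch, not this sample's kernel):

    __global__ void simpleMPIKernelGuarded(float *input, float *output, int n)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) { // the last block may contain threads past the end of the data
            output[tid] = sqrtf(input[tid]);
        }
    }

    // Round the block count up so all n elements are covered:
    //   int blocks = (n + blockSize - 1) / blockSize;
    //   simpleMPIKernelGuarded<<<blocks, blockSize>>>(d_in, d_out, n);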

// Initialize an array with random data (between 0 and 1)
-void initData(float *data, int dataSize) {
-  for (int i = 0; i < dataSize; i++) {
-    data[i] = (float)rand() / RAND_MAX;
-  }
+void initData(float *data, int dataSize)
+{
+    for (int i = 0; i < dataSize; i++) {
+        data[i] = (float)rand() / RAND_MAX;
+    }
}

// CUDA computation on each node
// No MPI here, only CUDA
-void computeGPU(float *hostData, int blockSize, int gridSize) {
-  int dataSize = blockSize * gridSize;
+void computeGPU(float *hostData, int blockSize, int gridSize)
+{
+    int dataSize = blockSize * gridSize;

-  // Allocate data on GPU memory
-  float *deviceInputData = NULL;
-  CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
+    // Allocate data on GPU memory
+    float *deviceInputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

-  float *deviceOutputData = NULL;
-  CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));
+    float *deviceOutputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

-  // Copy to GPU memory
-  CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float),
-                        cudaMemcpyHostToDevice));
+    // Copy to GPU memory
+    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

-  // Run kernel
-  simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
+    // Run kernel
+    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

-  // Copy data back to CPU memory
-  CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float),
-                        cudaMemcpyDeviceToHost));
+    // Copy data back to CPU memory
+    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

-  // Free GPU memory
-  CUDA_CHECK(cudaFree(deviceInputData));
-  CUDA_CHECK(cudaFree(deviceOutputData));
+    // Free GPU memory
+    CUDA_CHECK(cudaFree(deviceInputData));
+    CUDA_CHECK(cudaFree(deviceOutputData));
}

-float sum(float *data, int size) {
-  float accum = 0.f;
+float sum(float *data, int size)
+{
+    float accum = 0.f;

-  for (int i = 0; i < size; i++) {
-    accum += data[i];
-  }
+    for (int i = 0; i < size; i++) {
+        accum += data[i];
+    }

-  return accum;
+    return accum;
}

@@ -26,19 +26,20 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
-*
-*  Generate some random numbers on one node.
-*  Dispatch them to all nodes.
-*  Compute their square root on each node's GPU.
-*  Compute the average of the results using MPI.
-*
-*  simpleMPI.h: common header file
-*/
+ *
+ *  Generate some random numbers on one node.
+ *  Dispatch them to all nodes.
+ *  Compute their square root on each node's GPU.
+ *  Compute the average of the results using MPI.
+ *
+ *  simpleMPI.h: common header file
+ */

// Forward declarations
-extern "C" {
-void initData(float *data, int dataSize);
-void computeGPU(float *hostData, int blockSize, int gridSize);
-float sum(float *data, int size);
-void my_abort(int err);
+extern "C"
+{
+    void  initData(float *data, int dataSize);
+    void  computeGPU(float *hostData, int blockSize, int gridSize);
+    float sum(float *data, int size);
+    void  my_abort(int err);
}

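The extern "C" block matters because simpleMPI.h is included both by simpleMPI.cpp (built with the MPI C++ compiler) and by simpleMPI.cu (built with nvcc); C linkage pins each function to a single unmangled symbol so the two translation units link regardless of compiler. A sketch of the difference:

    // With C linkage, both compilers emit the plain symbol:
    //   computeGPU
    extern "C" void computeGPU(float *hostData, int blockSize, int gridSize);

    // With default C++ linkage, each compiler would emit a mangled name
    // (e.g. _Z10computeGPUPfii under the Itanium ABI), and any mismatch
    // surfaces only at link time as an undefined reference.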
@@ -38,7 +38,7 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
-*/
+ */

const char *sSDKname = "simpleMultiCopy";

@@ -50,25 +50,26 @@ const char *sSDKname = "simpleMultiCopy";

// includes, project
#include <helper_cuda.h>
-#include <helper_functions.h>  // helper for shared that are common to CUDA Samples
+#include <helper_functions.h> // helper for shared that are common to CUDA Samples

// includes, kernels
// Declare the CUDA kernels here and main() code that is needed to launch
// Compute workload on the system
-__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;

-  if (idx < N) {
-    for (int i = 0; i < inner_reps; ++i) {
-      g_out[idx] = g_in[idx] + 1;
+    if (idx < N) {
+        for (int i = 0; i < inner_reps; ++i) {
+            g_out[idx] = g_in[idx] + 1;
+        }
    }
-  }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
-//#define SIMULATE_IO
+// #define SIMULATE_IO

int *h_data_source;
int *h_data_sink;
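One detail worth flagging before the allocations further down: the h_data_in/h_data_out buffers come from cudaHostAlloc, not malloc. cudaMemcpyAsync can only overlap with kernel execution when the host buffer is page-locked (pinned); with pageable memory the copy is staged through an internal buffer and the overlap this sample measures would largely disappear. A minimal sketch of the pairing, assuming a single stream:

    int         *h_buf = NULL, *d_buf = NULL;
    cudaStream_t s;
    checkCudaErrors(cudaStreamCreate(&s));
    checkCudaErrors(cudaHostAlloc(&h_buf, memsize, cudaHostAllocDefault)); // pinned host memory
    checkCudaErrors(cudaMalloc(&d_buf, memsize));

    // Truly asynchronous only because h_buf is pinned; with malloc'd
    // memory this copy could not overlap with work in other streams.
    checkCudaErrors(cudaMemcpyAsync(d_buf, h_buf, memsize, cudaMemcpyHostToDevice, s));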
@@ -79,13 +80,13 @@ int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

-cudaEvent_t cycleDone[STREAM_COUNT];
+cudaEvent_t  cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

-int N = 1 << 22;
-int nreps = 10;  // number of times each experiment is repeated
+int N          = 1 << 22;
+int nreps      = 10; // number of times each experiment is repeated
int inner_reps = 5;

int memsize;
@@ -96,278 +97,268 @@ dim3 grid;
int thread_blocks;

float processWithStreams(int streams_used);
-void init();
-bool test();
+void  init();
+bool  test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
-  int cuda_device = 0;
-  float scale_factor;
-  cudaDeviceProp deviceProp;
+int main(int argc, char *argv[])
+{
+    int            cuda_device = 0;
+    float          scale_factor;
+    cudaDeviceProp deviceProp;

-  printf("[%s] - Starting...\n", sSDKname);
+    printf("[%s] - Starting...\n", sSDKname);

-  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
-    cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");
+    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
+        cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

-    if (cuda_device < 0) {
-      printf("Invalid command line parameters\n");
-      exit(EXIT_FAILURE);
-    } else {
-      printf("cuda_device = %d\n", cuda_device);
-      cuda_device = gpuDeviceInit(cuda_device);
+        if (cuda_device < 0) {
+            printf("Invalid command line parameters\n");
+            exit(EXIT_FAILURE);
+        }
+        else {
+            printf("cuda_device = %d\n", cuda_device);
+            cuda_device = gpuDeviceInit(cuda_device);

-      if (cuda_device < 0) {
-        printf("No CUDA Capable devices found, exiting...\n");
-        exit(EXIT_SUCCESS);
-      }
+            if (cuda_device < 0) {
+                printf("No CUDA Capable devices found, exiting...\n");
+                exit(EXIT_SUCCESS);
+            }
+        }
    }
-  } else {
-    // Otherwise pick the device with the highest Gflops/s
-    cuda_device = gpuGetMaxGflopsDeviceId();
-    checkCudaErrors(cudaSetDevice(cuda_device));
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-    printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
-  }
+    else {
+        // Otherwise pick the device with the highest Gflops/s
+        cuda_device = gpuGetMaxGflopsDeviceId();
+        checkCudaErrors(cudaSetDevice(cuda_device));
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+        printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
+    }

-  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
-         deviceProp.multiProcessorCount,
-         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
-         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-             deviceProp.multiProcessorCount);
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
+           deviceProp.name,
+           deviceProp.multiProcessorCount,
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

-  // Anything that is less than 32 Cores will have scaled down workload
-  scale_factor =
-      max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-                    (float)deviceProp.multiProcessorCount)),
-          1.0f);
-  N = (int)((float)N / scale_factor);
+    // Anything that is less than 32 Cores will have scaled down workload
+    scale_factor =
+        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
+            1.0f);
+    N = (int)((float)N / scale_factor);

-  printf("> Device name: %s\n", deviceProp.name);
-  printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
-         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
-  printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
-  printf("> array_size   = %d\n\n", N);
+    printf("> Device name: %s\n", deviceProp.name);
+    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
+           deviceProp.major,
+           deviceProp.minor,
+           deviceProp.multiProcessorCount);
+    printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
+    printf("> array_size   = %d\n\n", N);

-  memsize = N * sizeof(int);
+    memsize = N * sizeof(int);

-  thread_blocks = N / block.x;
+    thread_blocks = N / block.x;

-  grid.x = thread_blocks % 65535;
-  grid.y = (thread_blocks / 65535 + 1);
+    grid.x = thread_blocks % 65535;
+    grid.y = (thread_blocks / 65535 + 1);

-  // Allocate resources
+    // Allocate resources

-  h_data_source = (int *)malloc(memsize);
-  h_data_sink = (int *)malloc(memsize);
+    h_data_source = (int *)malloc(memsize);
+    h_data_sink   = (int *)malloc(memsize);

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    checkCudaErrors(
-        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
-    checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
-    checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
-
-    checkCudaErrors(
-        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
-    checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
-
-    checkCudaErrors(cudaStreamCreate(&stream[i]));
-    checkCudaErrors(cudaEventCreate(&cycleDone[i]));
-
-    cudaEventRecord(cycleDone[i], stream[i]);
-  }
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
+        checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
+
+        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
+
+        checkCudaErrors(cudaStreamCreate(&stream[i]));
+        checkCudaErrors(cudaEventCreate(&cycleDone[i]));
+
+        cudaEventRecord(cycleDone[i], stream[i]);
+    }

-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);

-  init();
+    init();

-  // Kernel warmup
-  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    // Kernel warmup
+    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

-  // Time copies and kernel
-  cudaEventRecord(start, 0);
-  checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
-                                  cudaMemcpyHostToDevice, 0));
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    // Time copies and kernel
+    cudaEventRecord(start, 0);
+    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  float memcpy_h2d_time;
-  cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
+    float memcpy_h2d_time;
+    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

-  cudaEventRecord(start, 0);
-  checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
-                                  cudaMemcpyDeviceToHost, 0));
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    cudaEventRecord(start, 0);
+    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  float memcpy_d2h_time;
-  cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
+    float memcpy_d2h_time;
+    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

-  cudaEventRecord(start, 0);
-  incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
+    cudaEventRecord(start, 0);
+    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);

-  float kernel_time;
-  cudaEventElapsedTime(&kernel_time, start, stop);
+    float kernel_time;
+    cudaEventElapsedTime(&kernel_time, start, stop);
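All three measurements above follow the standard CUDA event-timing idiom: record an event before and after the work in a stream, block on the stop event, then read back the elapsed milliseconds. Condensed:

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);

    cudaEventRecord(t0, 0);   // enqueue start marker in stream 0
    /* ... enqueue the copies or kernel to be timed ... */
    cudaEventRecord(t1, 0);   // enqueue stop marker
    cudaEventSynchronize(t1); // wait until the stop marker has executed

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1); // GPU-side elapsed time, in milliseconds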

-  printf("\n");
-  printf("Relevant properties of this CUDA device\n");
-  printf(
-      "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
-      "(device property \"deviceOverlap\")\n",
-      deviceProp.deviceOverlap ? "X" : " ");
-  // printf("(%s) Can execute several GPU kernels simultaneously (compute
-  // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
-  printf(
-      "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
-      "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
-      "4000/5000/6000/K5000)\n",
-      (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
+    printf("\n");
+    printf("Relevant properties of this CUDA device\n");
+    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
+           "(device property \"deviceOverlap\")\n",
+           deviceProp.deviceOverlap ? "X" : " ");
+    // printf("(%s) Can execute several GPU kernels simultaneously (compute
+    // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
+    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
+           "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
+           "4000/5000/6000/K5000)\n",
+           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

-  printf("\n");
-  printf("Measured timings (throughput):\n");
-  printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
-         (memsize * 1e-6) / memcpy_h2d_time);
-  printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
-         (memsize * 1e-6) / memcpy_d2h_time);
-  printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
-         (inner_reps * memsize * 2e-6) / kernel_time);
+    printf("\n");
+    printf("Measured timings (throughput):\n");
+    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
+    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
+    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

-  printf("\n");
-  printf(
-      "Theoretical limits for speedup gained from overlapped data "
-      "transfers:\n");
-  printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
-         memcpy_h2d_time + memcpy_d2h_time + kernel_time);
-  printf("Compute can overlap with one transfer: %f ms\n",
-         max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
-  printf("Compute can overlap with both data transfers: %f ms\n",
-         max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
+    printf("\n");
+    printf("Theoretical limits for speedup gained from overlapped data "
+           "transfers:\n");
+    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
+    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
+    printf("Compute can overlap with both data transfers: %f ms\n",
+           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

-  // Process pipelined work
-  float serial_time = processWithStreams(1);
-  float overlap_time = processWithStreams(STREAM_COUNT);
+    // Process pipelined work
+    float serial_time  = processWithStreams(1);
+    float overlap_time = processWithStreams(STREAM_COUNT);

-  printf("\nAverage measured timings over %d repetitions:\n", nreps);
-  printf(" Avg. time when execution fully serialized\t: %f ms\n",
-         serial_time / nreps);
-  printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT,
-         overlap_time / nreps);
-  printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
-         (serial_time - overlap_time) / nreps);
+    printf("\nAverage measured timings over %d repetitions:\n", nreps);
+    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
+    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
+    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

-  printf("\nMeasured throughput:\n");
-  printf(" Fully serialized execution\t\t: %f GB/s\n",
-         (nreps * (memsize * 2e-6)) / serial_time);
-  printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
-         (nreps * (memsize * 2e-6)) / overlap_time);
+    printf("\nMeasured throughput:\n");
+    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
+    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

-  // Verify the results, we will use the results for final output
-  bool bResults = test();
+    // Verify the results, we will use the results for final output
+    bool bResults = test();

-  // Free resources
+    // Free resources

-  free(h_data_source);
-  free(h_data_sink);
+    free(h_data_source);
+    free(h_data_sink);

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    cudaFreeHost(h_data_in[i]);
-    cudaFree(d_data_in[i]);
-
-    cudaFreeHost(h_data_out[i]);
-    cudaFree(d_data_out[i]);
-
-    cudaStreamDestroy(stream[i]);
-    cudaEventDestroy(cycleDone[i]);
-  }
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        cudaFreeHost(h_data_in[i]);
+        cudaFree(d_data_in[i]);
+
+        cudaFreeHost(h_data_out[i]);
+        cudaFree(d_data_out[i]);
+
+        cudaStreamDestroy(stream[i]);
+        cudaEventDestroy(cycleDone[i]);
+    }

-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);

-  // Test result
-  exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
+    // Test result
+    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

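The three "theoretical limits" printed above are simple sum/max bounds. For example, with memcpy_h2d_time = memcpy_d2h_time = 5 ms and kernel_time = 8 ms: no overlap costs 5 + 8 + 5 = 18 ms per cycle, overlapping compute with one copy direction is bounded by max(5 + 5, 8) = 10 ms, and overlapping compute with both copies is bounded by max(5, 5, 8) = 8 ms, the kernel time itself.
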
-float processWithStreams(int streams_used) {
-  int current_stream = 0;
+float processWithStreams(int streams_used)
+{
+    int current_stream = 0;

-  float time;
+    float time;

-  // Do processing in a loop
-  //
-  // Note: All memory commands are processed in the order  they are issued,
-  // independent of the stream they are enqueued in. Hence the pattern by
-  // which the copy and kernel commands are enqueued in the stream
-  // has an influence on the achieved overlap.
+    // Do processing in a loop
+    //
+    // Note: All memory commands are processed in the order  they are issued,
+    // independent of the stream they are enqueued in. Hence the pattern by
+    // which the copy and kernel commands are enqueued in the stream
+    // has an influence on the achieved overlap.

-  cudaEventRecord(start, 0);
+    cudaEventRecord(start, 0);

-  for (int i = 0; i < nreps; ++i) {
-    int next_stream = (current_stream + 1) % streams_used;
+    for (int i = 0; i < nreps; ++i) {
+        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
-    // Store the result
-    memcpy(h_data_sink, h_data_out[current_stream], memsize);
+        // Store the result
+        memcpy(h_data_sink, h_data_out[current_stream], memsize);

-    // Read new input
-    memcpy(h_data_in[next_stream], h_data_source, memsize);
+        // Read new input
+        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

-    // Ensure that processing and copying of the last cycle has finished
-    cudaEventSynchronize(cycleDone[next_stream]);
+        // Ensure that processing and copying of the last cycle has finished
+        cudaEventSynchronize(cycleDone[next_stream]);

-    // Process current frame
-    incKernel<<<grid, block, 0, stream[current_stream]>>>(
-        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);
+        // Process current frame
+        incKernel<<<grid, block, 0, stream[current_stream]>>>(
+            d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

-    // Upload next frame
-    checkCudaErrors(
-        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
-                        cudaMemcpyHostToDevice, stream[next_stream]));
+        // Upload next frame
+        checkCudaErrors(cudaMemcpyAsync(
+            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

-    // Download current frame
-    checkCudaErrors(cudaMemcpyAsync(
-        h_data_out[current_stream], d_data_out[current_stream], memsize,
-        cudaMemcpyDeviceToHost, stream[current_stream]));
+        // Download current frame
+        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
+                                        d_data_out[current_stream],
+                                        memsize,
+                                        cudaMemcpyDeviceToHost,
+                                        stream[current_stream]));

-    checkCudaErrors(
-        cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
+        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

-    current_stream = next_stream;
-  }
+        current_stream = next_stream;
+    }

-  cudaEventRecord(stop, 0);
+    cudaEventRecord(stop, 0);

-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();

-  cudaEventElapsedTime(&time, start, stop);
+    cudaEventElapsedTime(&time, start, stop);

-  return time;
+    return time;
}

-void init() {
-  for (int i = 0; i < N; ++i) {
-    h_data_source[i] = 0;
-  }
+void init()
+{
+    for (int i = 0; i < N; ++i) {
+        h_data_source[i] = 0;
+    }

-  for (int i = 0; i < STREAM_COUNT; ++i) {
-    memcpy(h_data_in[i], h_data_source, memsize);
-  }
+    for (int i = 0; i < STREAM_COUNT; ++i) {
+        memcpy(h_data_in[i], h_data_source, memsize);
+    }
}

-bool test() {
-  bool passed = true;
+bool test()
+{
+    bool passed = true;

-  for (int j = 0; j < STREAM_COUNT; ++j) {
-    for (int i = 0; i < N; ++i) {
-      passed &= (h_data_out[j][i] == 1);
+    for (int j = 0; j < STREAM_COUNT; ++j) {
+        for (int i = 0; i < N; ++i) {
+            passed &= (h_data_out[j][i] == 1);
+        }
    }
-  }

-  return passed;
+    return passed;
}

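The loop in processWithStreams above is a rotating buffer pipeline: the cycleDone event recorded at the end of each cycle is what makes it safe to reuse a stream's buffers on a later iteration. The essential shape, reduced to two streams and shortened names (a sketch of the pattern, not a drop-in replacement):

    int cur = 0;
    for (int i = 0; i < nreps; ++i) {
        int next = (cur + 1) % 2;

        cudaEventSynchronize(cycleDone[next]); // buffers of `next` are free again
        incKernel<<<grid, block, 0, stream[cur]>>>(d_out[cur], d_in[cur], N, inner_reps);
        cudaMemcpyAsync(d_in[next], h_in[next], memsize, cudaMemcpyHostToDevice, stream[next]); // upload frame i+1
        cudaMemcpyAsync(h_out[cur], d_out[cur], memsize, cudaMemcpyDeviceToHost, stream[cur]);  // download frame i
        cudaEventRecord(cycleDone[cur], stream[cur]); // marks when cur's buffers may be reused

        cur = next;
    }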
@@ -37,15 +37,15 @@
 */

// System includes
-#include <stdio.h>
#include <assert.h>
+#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
-#include <helper_functions.h>
#include <helper_cuda.h>
+#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
@@ -57,180 +57,176 @@
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
-const int DATA_N = 1048576 * 32;
+const int DATA_N        = 1048576 * 32;

////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
-__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
-  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int threadN = gridDim.x * blockDim.x;
-  float sum = 0;
+__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
+{
+    const int tid     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int threadN = gridDim.x * blockDim.x;
+    float     sum     = 0;

-  for (int pos = tid; pos < N; pos += threadN) sum += d_Input[pos];
+    for (int pos = tid; pos < N; pos += threadN)
+        sum += d_Input[pos];

-  d_Result[tid] = sum;
+    d_Result[tid] = sum;
}

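reduceKernel above is a grid-stride loop: the launch is fixed at BLOCK_N x THREAD_N = 32 x 256 = 8192 threads, and each thread sums elements tid, tid + 8192, tid + 16384, and so on, so any N is covered without retuning the launch. Each thread then writes one partial sum, and the host finishes the reduction by adding the ACCUM_N = 8192 partials (the loop over h_Sum_from_device in main() below). Rough arithmetic for scale:

    // With DATA_N = 1048576 * 32 split over, say, 2 GPUs:
    //   dataN per GPU       = 16777216 elements
    //   threads per launch  = 32 * 256 = 8192
    //   elements per thread = 16777216 / 8192 = 2048 loop iterations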
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
-  // Solver config
-  TGPUplan plan[MAX_GPU_COUNT];
+int main(int argc, char **argv)
+{
+    // Solver config
+    TGPUplan plan[MAX_GPU_COUNT];

-  // GPU reduction results
-  float h_SumGPU[MAX_GPU_COUNT];
+    // GPU reduction results
+    float h_SumGPU[MAX_GPU_COUNT];

-  float sumGPU;
-  double sumCPU, diff;
+    float  sumGPU;
+    double sumCPU, diff;

-  int i, j, gpuBase, GPU_N;
+    int i, j, gpuBase, GPU_N;

-  const int BLOCK_N = 32;
-  const int THREAD_N = 256;
-  const int ACCUM_N = BLOCK_N * THREAD_N;
+    const int BLOCK_N  = 32;
+    const int THREAD_N = 256;
+    const int ACCUM_N  = BLOCK_N * THREAD_N;

-  printf("Starting simpleMultiGPU\n");
-  checkCudaErrors(cudaGetDeviceCount(&GPU_N));
+    printf("Starting simpleMultiGPU\n");
+    checkCudaErrors(cudaGetDeviceCount(&GPU_N));

-  if (GPU_N > MAX_GPU_COUNT) {
-    GPU_N = MAX_GPU_COUNT;
-  }
+    if (GPU_N > MAX_GPU_COUNT) {
+        GPU_N = MAX_GPU_COUNT;
+    }

-  printf("CUDA-capable device count: %i\n", GPU_N);
+    printf("CUDA-capable device count: %i\n", GPU_N);

-  printf("Generating input data...\n\n");
+    printf("Generating input data...\n\n");

-  // Subdividing input data across GPUs
-  // Get data sizes for each GPU
-  for (i = 0; i < GPU_N; i++) {
-    plan[i].dataN = DATA_N / GPU_N;
-  }
+    // Subdividing input data across GPUs
+    // Get data sizes for each GPU
+    for (i = 0; i < GPU_N; i++) {
+        plan[i].dataN = DATA_N / GPU_N;
+    }

-  // Take into account "odd" data sizes
-  for (i = 0; i < DATA_N % GPU_N; i++) {
-    plan[i].dataN++;
-  }
+    // Take into account "odd" data sizes
+    for (i = 0; i < DATA_N % GPU_N; i++) {
+        plan[i].dataN++;
+    }

-  // Assign data ranges to GPUs
-  gpuBase = 0;
+    // Assign data ranges to GPUs
+    gpuBase = 0;

-  for (i = 0; i < GPU_N; i++) {
-    plan[i].h_Sum = h_SumGPU + i;
-    gpuBase += plan[i].dataN;
-  }
+    for (i = 0; i < GPU_N; i++) {
+        plan[i].h_Sum = h_SumGPU + i;
+        gpuBase += plan[i].dataN;
+    }

-  // Create streams for issuing GPU command asynchronously and allocate memory
-  // (GPU and System page-locked)
-  for (i = 0; i < GPU_N; i++) {
-    checkCudaErrors(cudaSetDevice(i));
-    checkCudaErrors(cudaStreamCreate(&plan[i].stream));
-    // Allocate memory
-    checkCudaErrors(
-        cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
-    checkCudaErrors(
-        cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
-    checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device,
-                                   ACCUM_N * sizeof(float)));
-    checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data,
-                                   plan[i].dataN * sizeof(float)));
-
-    for (j = 0; j < plan[i].dataN; j++) {
-      plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
-    }
-  }
+    // Create streams for issuing GPU command asynchronously and allocate memory
+    // (GPU and System page-locked)
+    for (i = 0; i < GPU_N; i++) {
+        checkCudaErrors(cudaSetDevice(i));
+        checkCudaErrors(cudaStreamCreate(&plan[i].stream));
+        // Allocate memory
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));
+
+        for (j = 0; j < plan[i].dataN; j++) {
+            plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
+        }
+    }

-  // Start timing and compute on GPU(s)
-  printf("Computing with %d GPUs...\n", GPU_N);
-  // create and start timer
-  StopWatchInterface *timer = NULL;
-  sdkCreateTimer(&timer);
+    // Start timing and compute on GPU(s)
+    printf("Computing with %d GPUs...\n", GPU_N);
+    // create and start timer
+    StopWatchInterface *timer = NULL;
+    sdkCreateTimer(&timer);

-  // start the timer
-  sdkStartTimer(&timer);
+    // start the timer
+    sdkStartTimer(&timer);

-  // Copy data to GPU, launch the kernel and copy data back. All asynchronously
-  for (i = 0; i < GPU_N; i++) {
-    // Set device
-    checkCudaErrors(cudaSetDevice(i));
-
-    // Copy input data from CPU
-    checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
-                                    plan[i].dataN * sizeof(float),
-                                    cudaMemcpyHostToDevice, plan[i].stream));
-
-    // Perform GPU computations
-    reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(
-        plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
-    getLastCudaError("reduceKernel() execution failed.\n");
-
-    // Read back GPU results
-    checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
-                                    ACCUM_N * sizeof(float),
-                                    cudaMemcpyDeviceToHost, plan[i].stream));
-  }
+    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
+    for (i = 0; i < GPU_N; i++) {
+        // Set device
+        checkCudaErrors(cudaSetDevice(i));
+
+        // Copy input data from CPU
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));
+
+        // Perform GPU computations
+        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
+        getLastCudaError("reduceKernel() execution failed.\n");
+
+        // Read back GPU results
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
+    }

-  // Process GPU results
-  for (i = 0; i < GPU_N; i++) {
-    float sum;
-
-    // Set device
-    checkCudaErrors(cudaSetDevice(i));
-
-    // Wait for all operations to finish
-    cudaStreamSynchronize(plan[i].stream);
-
-    // Finalize GPU reduction for current subvector
-    sum = 0;
-
-    for (j = 0; j < ACCUM_N; j++) {
-      sum += plan[i].h_Sum_from_device[j];
-    }
-
-    *(plan[i].h_Sum) = (float)sum;
-
-    // Shut down this GPU
-    checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
-    checkCudaErrors(cudaFree(plan[i].d_Sum));
-    checkCudaErrors(cudaFree(plan[i].d_Data));
-    checkCudaErrors(cudaStreamDestroy(plan[i].stream));
-  }
+    // Process GPU results
+    for (i = 0; i < GPU_N; i++) {
+        float sum;
+
+        // Set device
+        checkCudaErrors(cudaSetDevice(i));
+
+        // Wait for all operations to finish
+        cudaStreamSynchronize(plan[i].stream);
+
+        // Finalize GPU reduction for current subvector
+        sum = 0;
+
+        for (j = 0; j < ACCUM_N; j++) {
+            sum += plan[i].h_Sum_from_device[j];
+        }
+
+        *(plan[i].h_Sum) = (float)sum;
+
+        // Shut down this GPU
+        checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
+        checkCudaErrors(cudaFree(plan[i].d_Sum));
+        checkCudaErrors(cudaFree(plan[i].d_Data));
+        checkCudaErrors(cudaStreamDestroy(plan[i].stream));
+    }

-  sumGPU = 0;
+    sumGPU = 0;

-  for (i = 0; i < GPU_N; i++) {
-    sumGPU += h_SumGPU[i];
-  }
+    for (i = 0; i < GPU_N; i++) {
+        sumGPU += h_SumGPU[i];
+    }

-  sdkStopTimer(&timer);
-  printf("  GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
-  sdkDeleteTimer(&timer);
+    sdkStopTimer(&timer);
+    printf("  GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
+    sdkDeleteTimer(&timer);

-  // Compute on Host CPU
-  printf("Computing with Host CPU...\n\n");
+    // Compute on Host CPU
+    printf("Computing with Host CPU...\n\n");

-  sumCPU = 0;
+    sumCPU = 0;

-  for (i = 0; i < GPU_N; i++) {
-    for (j = 0; j < plan[i].dataN; j++) {
-      sumCPU += plan[i].h_Data[j];
-    }
-  }
+    for (i = 0; i < GPU_N; i++) {
+        for (j = 0; j < plan[i].dataN; j++) {
+            sumCPU += plan[i].h_Data[j];
+        }
+    }

-  // Compare GPU and CPU results
-  printf("Comparing GPU and Host CPU results...\n");
-  diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
-  printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
-  printf("  Relative difference: %E \n\n", diff);
+    // Compare GPU and CPU results
+    printf("Comparing GPU and Host CPU results...\n");
+    diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
+    printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
+    printf("  Relative difference: %E \n\n", diff);

-  // Cleanup and shutdown
-  for (i = 0; i < GPU_N; i++) {
-    checkCudaErrors(cudaSetDevice(i));
-    checkCudaErrors(cudaFreeHost(plan[i].h_Data));
-  }
+    // Cleanup and shutdown
+    for (i = 0; i < GPU_N; i++) {
+        checkCudaErrors(cudaSetDevice(i));
+        checkCudaErrors(cudaFreeHost(plan[i].h_Data));
+    }

-  exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
+    exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}

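A design point in main() above: the copy-launch-copy loop issues work on every GPU before anything is synchronized, and a separate loop performs the waits. This breadth-first issue order is what lets the GPUs run concurrently; synchronizing inside the first loop would serialize the devices. The skeleton:

    // Issue phase: enqueue async work on every device, wait on none.
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        // ... cudaMemcpyAsync H2D, kernel launch, cudaMemcpyAsync D2H,
        //     all enqueued on plan[i].stream ...
    }

    // Drain phase: only now block on each device's stream.
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        cudaStreamSynchronize(plan[i].stream);
    }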
@@ -37,26 +37,26 @@
#ifndef SIMPLEMULTIGPU_H
#define SIMPLEMULTIGPU_H

-typedef struct {
-  // Host-side input data
-  int dataN;
-  float *h_Data;
+typedef struct
+{
+    // Host-side input data
+    int    dataN;
+    float *h_Data;

-  // Partial sum for this GPU
-  float *h_Sum;
+    // Partial sum for this GPU
+    float *h_Sum;

-  // Device buffers
-  float *d_Data, *d_Sum;
+    // Device buffers
+    float *d_Data, *d_Sum;

-  // Reduction copied back from GPU
-  float *h_Sum_from_device;
+    // Reduction copied back from GPU
+    float *h_Sum_from_device;

-  // Stream for asynchronous command execution
-  cudaStream_t stream;
+    // Stream for asynchronous command execution
+    cudaStream_t stream;

} TGPUplan;

-extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N,
-                                    int BLOCK_N, int THREAD_N, cudaStream_t &s);
+extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);

#endif

@@ -25,8 +25,8 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+#include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream>
-#include <helper_cuda.h>  // helper functions for CUDA error check

const int manualBlockSize = 32;

@@ -38,13 +38,14 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator
// API suggests.
////////////////////////////////////////////////////////////////////////////////
-__global__ void square(int *array, int arrayCount) {
-  extern __shared__ int dynamicSmem[];
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+__global__ void square(int *array, int arrayCount)
+{
+    extern __shared__ int dynamicSmem[];
+    int                   idx = threadIdx.x + blockIdx.x * blockDim.x;

-  if (idx < arrayCount) {
-    array[idx] *= array[idx];
-  }
+    if (idx < arrayCount) {
+        array[idx] *= array[idx];
+    }
}

////////////////////////////////////////////////////////////////////////////////
@ -58,29 +59,28 @@ __global__ void square(int *array, int arrayCount) {
 | 
			
		||||
// This wrapper routine computes the occupancy of kernel, and reports
 | 
			
		||||
// it in terms of active warps / maximum warps per SM.
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
static double reportPotentialOccupancy(void *kernel, int blockSize,
 | 
			
		||||
                                       size_t dynamicSMem) {
 | 
			
		||||
  int device;
 | 
			
		||||
  cudaDeviceProp prop;
 | 
			
		||||
static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
 | 
			
		||||
{
 | 
			
		||||
    int            device;
 | 
			
		||||
    cudaDeviceProp prop;
 | 
			
		||||
 | 
			
		||||
  int numBlocks;
 | 
			
		||||
  int activeWarps;
 | 
			
		||||
  int maxWarps;
 | 
			
		||||
    int numBlocks;
 | 
			
		||||
    int activeWarps;
 | 
			
		||||
    int maxWarps;
 | 
			
		||||
 | 
			
		||||
  double occupancy;
 | 
			
		||||
    double occupancy;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaGetDevice(&device));
 | 
			
		||||
  checkCudaErrors(cudaGetDeviceProperties(&prop, device));
 | 
			
		||||
    checkCudaErrors(cudaGetDevice(&device));
 | 
			
		||||
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
 | 
			
		||||
      &numBlocks, kernel, blockSize, dynamicSMem));
 | 
			
		||||
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));
 | 
			
		||||
 | 
			
		||||
  activeWarps = numBlocks * blockSize / prop.warpSize;
 | 
			
		||||
  maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
 | 
			
		||||
    activeWarps = numBlocks * blockSize / prop.warpSize;
 | 
			
		||||
    maxWarps    = prop.maxThreadsPerMultiProcessor / prop.warpSize;
 | 
			
		||||
 | 
			
		||||
  occupancy = (double)activeWarps / maxWarps;
 | 
			
		||||
    occupancy = (double)activeWarps / maxWarps;
 | 
			
		||||
 | 
			
		||||
  return occupancy;
 | 
			
		||||
    return occupancy;
 | 
			
		||||
}
 | 
			
		||||
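A worked instance of the ratio computed above, using illustrative device figures:

// Assume warpSize = 32 and maxThreadsPerMultiProcessor = 2048, so
// maxWarps = 2048 / 32 = 64. If the calculator reports numBlocks = 16
// for blockSize = 128, then
//   activeWarps = 16 * 128 / 32 = 64
//   occupancy   = 64 / 64      = 1.0, i.e. 100% of the SM's warp slots.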

////////////////////////////////////////////////////////////////////////////////
@@ -99,65 +99,63 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
// This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime.
////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic) {
  int blockSize;
  int minGridSize;
  int gridSize;
  size_t dynamicSMemUsage = 0;
static int launchConfig(int *array, int arrayCount, bool automatic)
{
    int    blockSize;
    int    minGridSize;
    int    gridSize;
    size_t dynamicSMemUsage = 0;

  cudaEvent_t start;
  cudaEvent_t end;
    cudaEvent_t start;
    cudaEvent_t end;

  float elapsedTime;
    float elapsedTime;

  double potentialOccupancy;
    double potentialOccupancy;

  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&end));
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&end));

  if (automatic) {
    checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
        &minGridSize, &blockSize, (void *)square, dynamicSMemUsage,
        arrayCount));
    if (automatic) {
        checkCudaErrors(
            cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));

    std::cout << "Suggested block size: " << blockSize << std::endl
              << "Minimum grid size for maximum occupancy: " << minGridSize
              << std::endl;
  } else {
    // This block size is too small. Given limited number of
    // active blocks per multiprocessor, the number of active
    // threads will be limited, and thus unable to achieve maximum
    // occupancy.
        std::cout << "Suggested block size: " << blockSize << std::endl
                  << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
    }
    else {
        // This block size is too small. Given limited number of
        // active blocks per multiprocessor, the number of active
        // threads will be limited, and thus unable to achieve maximum
        // occupancy.
        //
        blockSize = manualBlockSize;
    }

    // Round up
    //
    blockSize = manualBlockSize;
  }
    gridSize = (arrayCount + blockSize - 1) / blockSize;

  // Round up
  //
  gridSize = (arrayCount + blockSize - 1) / blockSize;
    // Launch and profile
    //
    checkCudaErrors(cudaEventRecord(start));
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount);
    checkCudaErrors(cudaEventRecord(end));

  // Launch and profile
  //
  checkCudaErrors(cudaEventRecord(start));
  square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount);
  checkCudaErrors(cudaEventRecord(end));
    checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaDeviceSynchronize());
    // Calculate occupancy
    //
    potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);

  // Calculate occupancy
  //
  potentialOccupancy =
      reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);
    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;

  std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%"
            << std::endl;
    // Report elapsed time
    //
    checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
    std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl;

  // Report elapsed time
  //
  checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
  std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl;

  return 0;
    return 0;
}
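Condensed to its essentials, the automatic path above follows a pattern that can be sketched standalone; array, arrayCount and checkCudaErrors are assumed from the surrounding sample:

// Sketch: let the runtime suggest a block size, then round the grid up so
// that gridSize * blockSize >= arrayCount covers every element.
int minGridSize = 0, blockSize = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, 0, arrayCount));
int gridSize = (arrayCount + blockSize - 1) / blockSize; // ceiling division
square<<<gridSize, blockSize>>>(array, arrayCount);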

////////////////////////////////////////////////////////////////////////////////
@@ -166,41 +164,41 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
// The test generates an array and squares it with a CUDA kernel, then
// verifies the result.
////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000) {
  int *array;
  int *dArray;
  int size = count * sizeof(int);
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    int *array;
    int *dArray;
    int  size = count * sizeof(int);

  array = new int[count];
    array = new int[count];

  for (int i = 0; i < count; i += 1) {
    array[i] = i;
  }

  checkCudaErrors(cudaMalloc(&dArray, size));
  checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));

  for (int i = 0; i < count; i += 1) {
    array[i] = 0;
  }

  launchConfig(dArray, count, automaticLaunchConfig);

  checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaFree(dArray));

  // Verify the return data
  //
  for (int i = 0; i < count; i += 1) {
    if (array[i] != i * i) {
      std::cout << "element " << i << " expected " << i * i << " actual "
                << array[i] << std::endl;
      return 1;
    for (int i = 0; i < count; i += 1) {
        array[i] = i;
    }
  }
  delete[] array;

  return 0;
    checkCudaErrors(cudaMalloc(&dArray, size));
    checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));

    for (int i = 0; i < count; i += 1) {
        array[i] = 0;
    }

    launchConfig(dArray, count, automaticLaunchConfig);

    checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(dArray));

    // Verify the return data
    //
    for (int i = 0; i < count; i += 1) {
        if (array[i] != i * i) {
            std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
            return 1;
        }
    }
    delete[] array;

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
@@ -210,31 +208,31 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
// automatically configured launch, and reports the occupancy and
// performance.
////////////////////////////////////////////////////////////////////////////////
int main() {
  int status;
int main()
{
    int status;

  std::cout << "starting Simple Occupancy" << std::endl << std::endl;
    std::cout << "starting Simple Occupancy" << std::endl << std::endl;

  std::cout << "[ Manual configuration with " << manualBlockSize
            << " threads per block ]" << std::endl;
    std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;

  status = test(false);
  if (status) {
    std::cerr << "Test failed\n" << std::endl;
    return -1;
  }
    status = test(false);
    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

  std::cout << std::endl;
    std::cout << std::endl;

  std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl;
  status = test(true);
  if (status) {
    std::cerr << "Test failed\n" << std::endl;
    return -1;
  }
    std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl;
    status = test(true);
    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

  std::cout << std::endl;
  std::cout << "Test PASSED\n" << std::endl;
    std::cout << std::endl;
    std::cout << "Test PASSED\n" << std::endl;

  return 0;
    return 0;
}

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)

@@ -31,230 +31,233 @@
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA includes
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

__global__ void SimpleKernel(float *src, float *dst) {
  // Just a dummy kernel, doing enough for us to verify that everything
  // worked
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  dst[idx] = src[idx] * 2.0f;
__global__ void SimpleKernel(float *src, float *dst)
{
    // Just a dummy kernel, doing enough for us to verify that everything
    // worked
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx]      = src[idx] * 2.0f;
}

inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

int main(int argc, char **argv) {
  printf("[%s] - Starting...\n", argv[0]);
int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", argv[0]);

  if (!IsAppBuiltAs64()) {
    printf(
        "%s is only supported on 64-bit OSs and the application must be "
        "built as a 64-bit target.  Test is being waived.\n",
        argv[0]);
    exit(EXIT_WAIVED);
  }

  // Number of GPUs
  printf("Checking for multiple GPUs...\n");
  int gpu_n;
  checkCudaErrors(cudaGetDeviceCount(&gpu_n));
  printf("CUDA-capable device count: %i\n", gpu_n);

  if (gpu_n < 2) {
    printf(
        "Two or more GPUs with Peer-to-Peer access capability are required for "
        "%s.\n",
        argv[0]);
    printf("Waiving test.\n");
    exit(EXIT_WAIVED);
  }

  // Query device properties
  cudaDeviceProp prop[64];
  int gpuid[2];  // we want to find the first two GPUs that can support P2P

  for (int i = 0; i < gpu_n; i++) {
    checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
  }
  // Check possibility for peer access
  printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

  int can_access_peer;
  int p2pCapableGPUs[2];  // We take only 1 pair of P2P capable GPUs
  p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

  // Show all the combinations of supported P2P GPUs
  for (int i = 0; i < gpu_n; i++) {
    for (int j = 0; j < gpu_n; j++) {
      if (i == j) {
        continue;
      }
      checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
      printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name,
             i, prop[j].name, j, can_access_peer ? "Yes" : "No");
      if (can_access_peer && p2pCapableGPUs[0] == -1) {
        p2pCapableGPUs[0] = i;
        p2pCapableGPUs[1] = j;
      }
    if (!IsAppBuiltAs64()) {
        printf("%s is only supported on 64-bit OSs and the application must be "
               "built as a 64-bit target.  Test is being waived.\n",
               argv[0]);
        exit(EXIT_WAIVED);
    }
  }

  if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
    printf(
        "Two or more GPUs with Peer-to-Peer access capability are required for "
        "%s.\n",
        argv[0]);
    printf(
        "Peer to Peer access is not available amongst GPUs in the system, "
        "waiving test.\n");
    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
    checkCudaErrors(cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);

    exit(EXIT_WAIVED);
  }

  // Use first pair of p2p capable GPUs detected.
  gpuid[0] = p2pCapableGPUs[0];
  gpuid[1] = p2pCapableGPUs[1];

  // Enable peer access
  printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0],
         gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

  // Allocate buffers
  const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
  printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
         int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  float *g0;
  checkCudaErrors(cudaMalloc(&g0, buf_size));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  float *g1;
  checkCudaErrors(cudaMalloc(&g1, buf_size));
  float *h0;
  checkCudaErrors(
      cudaMallocHost(&h0, buf_size));  // Automatically portable with UVA

  // Create CUDA event handles
  printf("Creating event handles...\n");
  cudaEvent_t start_event, stop_event;
  float time_memcpy;
  int eventflags = cudaEventBlockingSync;
  checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
  checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

  // P2P memcopy() benchmark
  checkCudaErrors(cudaEventRecord(start_event, 0));

  for (int i = 0; i < 100; i++) {
    // With UVA we don't need to specify source and target devices, the
    // runtime figures this out by itself from the pointers
    // Ping-pong copy between GPUs
    if (i % 2 == 0) {
      checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
    } else {
      checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    if (gpu_n < 2) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Waiving test.\n");
        exit(EXIT_WAIVED);
    }
  }

  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
  printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
         gpuid[0], gpuid[1],
         (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f /
             1024.0f / 1024.0f);
    // Query device properties
    cudaDeviceProp prop[64];
    int            gpuid[2]; // we want to find the first two GPUs that can support P2P

  // Prepare host buffer and copy to GPU 0
  printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

  for (int i = 0; i < buf_size / sizeof(float); i++) {
    h0[i] = float(i % 4096);
  }

  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

  // Kernel launch configuration
  const dim3 threads(512, 1);
  const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

  // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
  // output to the GPU 1 buffer
  printf(
      "Run kernel on GPU%d, taking source data from GPU%d and writing to "
      "GPU%d...\n",
      gpuid[1], gpuid[0], gpuid[1]);
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  SimpleKernel<<<blocks, threads>>>(g0, g1);

  checkCudaErrors(cudaDeviceSynchronize());

  // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
  // output to the GPU 0 buffer
  printf(
      "Run kernel on GPU%d, taking source data from GPU%d and writing to "
      "GPU%d...\n",
      gpuid[0], gpuid[1], gpuid[0]);
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  SimpleKernel<<<blocks, threads>>>(g1, g0);

  checkCudaErrors(cudaDeviceSynchronize());

  // Copy data back to host and verify
  printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
  checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

  int error_count = 0;

  for (int i = 0; i < buf_size / sizeof(float); i++) {
    // Re-generate input data and apply 2x '* 2.0f' computation of both
    // kernel runs
    if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
      printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i],
             (float(i % 4096) * 2.0f * 2.0f));

      if (error_count++ > 10) {
        break;
      }
    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
    }
  }
    // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

  // Disable peer access (also unregisters memory for non-UVA cases)
  printf("Disabling peer access...\n");
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));
    int can_access_peer;
    int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

  // Cleanup and shutdown
  printf("Shutting down...\n");
  checkCudaErrors(cudaEventDestroy(start_event));
  checkCudaErrors(cudaEventDestroy(stop_event));
  checkCudaErrors(cudaSetDevice(gpuid[0]));
  checkCudaErrors(cudaFree(g0));
  checkCudaErrors(cudaSetDevice(gpuid[1]));
  checkCudaErrors(cudaFree(g1));
  checkCudaErrors(cudaFreeHost(h0));
    // Show all the combinations of supported P2P GPUs
    for (int i = 0; i < gpu_n; i++) {
        for (int j = 0; j < gpu_n; j++) {
            if (i == j) {
                continue;
            }
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   prop[i].name,
                   i,
                   prop[j].name,
                   j,
                   can_access_peer ? "Yes" : "No");
            if (can_access_peer && p2pCapableGPUs[0] == -1) {
                p2pCapableGPUs[0] = i;
                p2pCapableGPUs[1] = j;
            }
        }
    }

  for (int i = 0; i < gpu_n; i++) {
    checkCudaErrors(cudaSetDevice(i));
  }
    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Peer to Peer access is not available amongst GPUs in the system, "
               "waiving test.\n");

  if (error_count != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  } else {
    printf("Test passed\n");
    exit(EXIT_SUCCESS);
  }
        exit(EXIT_WAIVED);
    }

    // Use first pair of p2p capable GPUs detected.
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];

    // Enable peer access
    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf(
        "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    float *g0;
    checkCudaErrors(cudaMalloc(&g0, buf_size));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    float *g1;
    checkCudaErrors(cudaMalloc(&g1, buf_size));
    float *h0;
    checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float       time_memcpy;
    int         eventflags = cudaEventBlockingSync;
    checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
    checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

    // P2P memcopy() benchmark
    checkCudaErrors(cudaEventRecord(start_event, 0));

    for (int i = 0; i < 100; i++) {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
        // Ping-pong copy between GPUs
        if (i % 2 == 0) {
            checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        }
        else {
            checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
        }
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
           gpuid[0],
           gpuid[1],
           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);
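To make the printed figure concrete, a worked reading of the expression above with an assumed timing:

// The loop performs 100 copies of buf_size bytes, so the expression is
//   GB/s = (100 * buf_size) / (time_memcpy in seconds) / 1024^3.
// buf_size = 1024 * 1024 * 16 * sizeof(float) = 64 MiB, so the benchmark
// moves 100 * 64 MiB = 6.25 GiB; an assumed time_memcpy of 500 ms would
// print (1 / 0.5) * 6.25 = 12.50 GB/s.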

    // Prepare host buffer and copy to GPU 0
    printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        h0[i] = float(i % 4096);
    }

    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

    // Kernel launch configuration
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
    // output to the GPU 1 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[1],
           gpuid[0],
           gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    SimpleKernel<<<blocks, threads>>>(g0, g1);

    checkCudaErrors(cudaDeviceSynchronize());

    // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
    // output to the GPU 0 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[0],
           gpuid[1],
           gpuid[0]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    SimpleKernel<<<blocks, threads>>>(g1, g0);

    checkCudaErrors(cudaDeviceSynchronize());

    // Copy data back to host and verify
    printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
    checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

    int error_count = 0;

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        // Re-generate input data and apply 2x '* 2.0f' computation of both
        // kernel runs
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

            if (error_count++ > 10) {
                break;
            }
        }
    }

    // Disable peer access (also unregisters memory for non-UVA cases)
    printf("Disabling peer access...\n");
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

    // Cleanup and shutdown
    printf("Shutting down...\n");
    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaFree(g0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaFree(g1));
    checkCudaErrors(cudaFreeHost(h0));

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaSetDevice(i));
    }

    if (error_count != 0) {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }
    else {
        printf("Test passed\n");
        exit(EXIT_SUCCESS);
    }
}
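For completeness, a standalone sketch, not part of the commit, of the capability probe this sample's GPU-pair search performs; device indices 0 and 1 are assumed:

// Peer access must be reported in both directions before either device
// calls cudaDeviceEnablePeerAccess on the other.
int canAccess01 = 0, canAccess10 = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess01, 0, 1));
checkCudaErrors(cudaDeviceCanAccessPeer(&canAccess10, 1, 0));
printf("P2P 0->1: %s, 1->0: %s\n", canAccess01 ? "Yes" : "No", canAccess10 ? "Yes" : "No");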

@@ -26,16 +26,16 @@
 */

/* pitchLinearTexture
*
* This example demonstrates how to use textures bound to pitch linear memory.
* It performs a shift of matrix elements using wrap addressing mode (aka
* periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
* in order to highlight the differences in using each.
*
* Textures binding to pitch linear memory is a new feature in CUDA 2.2,
* and allows use of texture features such as wrap addressing mode and
* filtering which are not possible with textures bound to regular linear memory
*/
 *
 * This example demonstrates how to use textures bound to pitch linear memory.
 * It performs a shift of matrix elements using wrap addressing mode (aka
 * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
 * in order to highlight the differences in using each.
 *
 * Textures binding to pitch linear memory is a new feature in CUDA 2.2,
 * and allows use of texture features such as wrap addressing mode and
 * filtering which are not possible with textures bound to regular linear memory
 */

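A worked instance of the wrap (periodic boundary) addressing described above, using this sample's dimensions:

// With normalizedCoords = true and cudaAddressModeWrap, coordinates wrap
// modulo 1.0. For nx = 2048, x_shift = 5 and xid = 2046:
//   u = (2046 + 5) / 2048 = 1.00146...  which wraps to 0.00146...
// so the fetch lands on texel 0.00146... * 2048 = 3, i.e. (2046 + 5) mod 2048.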
// includes, system
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
@ -50,13 +50,13 @@
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
 | 
			
		||||
// Utilities and timing functions
 | 
			
		||||
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
 | 
			
		||||
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
 | 
			
		||||
 | 
			
		||||
// CUDA helper functions
 | 
			
		||||
#include <helper_cuda.h>  // helper functions for CUDA error check
 | 
			
		||||
#include <helper_cuda.h> // helper functions for CUDA error check
 | 
			
		||||
 | 
			
		||||
#define NUM_REPS 100  // number of repetitions performed
 | 
			
		||||
#define TILE_DIM 16   // tile/block size
 | 
			
		||||
#define NUM_REPS 100 // number of repetitions performed
 | 
			
		||||
#define TILE_DIM 16  // tile/block size
 | 
			
		||||
 | 
			
		||||
const char *sSDKsample = "simplePitchLinearTexture";
 | 
			
		||||
 | 
			
		||||
@ -70,29 +70,26 @@ bool bTestResult = true;
 | 
			
		||||
//! Shifts matrix elements using pitch linear array
 | 
			
		||||
//! @param odata  output data in global memory
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height,
 | 
			
		||||
                                 int shiftX, int shiftY,
 | 
			
		||||
                                 cudaTextureObject_t texRefPL) {
 | 
			
		||||
  int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
  int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
__global__ void
 | 
			
		||||
shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
 | 
			
		||||
{
 | 
			
		||||
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
 | 
			
		||||
  odata[yid * pitch + xid] = tex2D<float>(
 | 
			
		||||
      texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 | 
			
		||||
    odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//! Shifts matrix elements using regular array
 | 
			
		||||
//! @param odata  output data in global memory
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
__global__ void shiftArray(float *odata, int pitch, int width, int height,
 | 
			
		||||
                           int shiftX, int shiftY,
 | 
			
		||||
                           cudaTextureObject_t texRefArray) {
 | 
			
		||||
  int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
  int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
__global__ void
 | 
			
		||||
shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
 | 
			
		||||
{
 | 
			
		||||
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    int yid = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
 | 
			
		||||
  odata[yid * pitch + xid] =
 | 
			
		||||
      tex2D<float>(texRefArray, (xid + shiftX) / (float)width,
 | 
			
		||||
                   (yid + shiftY) / (float)height);
 | 
			
		||||
    odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@ -102,210 +99,199 @@ void runTest(int argc, char **argv);
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Program main
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  printf("%s starting...\n\n", sSDKsample);
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    printf("%s starting...\n\n", sSDKsample);
 | 
			
		||||
 | 
			
		||||
  runTest(argc, argv);
 | 
			
		||||
    runTest(argc, argv);
 | 
			
		||||
 | 
			
		||||
  printf("%s completed, returned %s\n", sSDKsample,
 | 
			
		||||
         bTestResult ? "OK" : "ERROR!");
 | 
			
		||||
  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
    printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
 | 
			
		||||
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//! Run a simple test for CUDA
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void runTest(int argc, char **argv) {
 | 
			
		||||
  // Set array size
 | 
			
		||||
  const int nx = 2048;
 | 
			
		||||
  const int ny = 2048;
 | 
			
		||||
void runTest(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    // Set array size
 | 
			
		||||
    const int nx = 2048;
 | 
			
		||||
    const int ny = 2048;
 | 
			
		||||
 | 
			
		||||
  // Setup shifts applied to x and y data
 | 
			
		||||
  const int x_shift = 5;
 | 
			
		||||
  const int y_shift = 7;
 | 
			
		||||
    // Setup shifts applied to x and y data
 | 
			
		||||
    const int x_shift = 5;
 | 
			
		||||
    const int y_shift = 7;
 | 
			
		||||
 | 
			
		||||
  if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
 | 
			
		||||
    printf("nx and ny must be multiples of TILE_DIM\n");
 | 
			
		||||
    exit(EXIT_FAILURE);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Setup execution configuration parameters
 | 
			
		||||
  dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
 | 
			
		||||
 | 
			
		||||
  // This will pick the best possible CUDA capable device
 | 
			
		||||
  int devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
  // CUDA events for timing
 | 
			
		||||
  cudaEvent_t start, stop;
 | 
			
		||||
  cudaEventCreate(&start);
 | 
			
		||||
  cudaEventCreate(&stop);
 | 
			
		||||
 | 
			
		||||
  // Host allocation and initialization
 | 
			
		||||
  float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
  float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
  float *gold = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < nx * ny; ++i) {
 | 
			
		||||
    h_idata[i] = (float)i;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Device memory allocation
 | 
			
		||||
  // Pitch linear input data
 | 
			
		||||
  float *d_idataPL;
 | 
			
		||||
  size_t d_pitchBytes;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes,
 | 
			
		||||
                                  nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  // Array input data
 | 
			
		||||
  cudaArray *d_idataArray;
 | 
			
		||||
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));
 | 
			
		||||
 | 
			
		||||
  // Pitch linear output data
 | 
			
		||||
  float *d_odata;
 | 
			
		||||
  checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes,
 | 
			
		||||
                                  nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  // Copy host data to device
 | 
			
		||||
  // Pitch linear
 | 
			
		||||
  size_t h_pitchBytes = nx * sizeof(float);
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes,
 | 
			
		||||
                               nx * sizeof(float), ny, cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  // Array
 | 
			
		||||
  checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata,
 | 
			
		||||
                                    nx * ny * sizeof(float),
 | 
			
		||||
                                    cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  cudaTextureObject_t texRefPL;
 | 
			
		||||
  cudaTextureObject_t texRefArray;
 | 
			
		||||
  cudaResourceDesc texRes;
 | 
			
		||||
  memset(&texRes, 0, sizeof(cudaResourceDesc));
 | 
			
		||||
 | 
			
		||||
  texRes.resType = cudaResourceTypePitch2D;
 | 
			
		||||
  texRes.res.pitch2D.devPtr = d_idataPL;
 | 
			
		||||
  texRes.res.pitch2D.desc = channelDesc;
 | 
			
		||||
  texRes.res.pitch2D.width = nx;
 | 
			
		||||
  texRes.res.pitch2D.height = ny;
 | 
			
		||||
  texRes.res.pitch2D.pitchInBytes = h_pitchBytes;
 | 
			
		||||
  cudaTextureDesc texDescr;
 | 
			
		||||
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
 | 
			
		||||
 | 
			
		||||
  texDescr.normalizedCoords = true;
 | 
			
		||||
  texDescr.filterMode = cudaFilterModePoint;
 | 
			
		||||
  texDescr.addressMode[0] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.addressMode[1] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.readMode = cudaReadModeElementType;
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));
 | 
			
		||||
  memset(&texRes, 0, sizeof(cudaResourceDesc));
 | 
			
		||||
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
 | 
			
		||||
  texRes.resType = cudaResourceTypeArray;
 | 
			
		||||
  texRes.res.array.array = d_idataArray;
 | 
			
		||||
  texDescr.normalizedCoords = true;
 | 
			
		||||
  texDescr.filterMode = cudaFilterModePoint;
 | 
			
		||||
  texDescr.addressMode[0] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.addressMode[1] = cudaAddressModeWrap;
 | 
			
		||||
  texDescr.readMode = cudaReadModeElementType;
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));
 | 
			
		||||
 | 
			
		||||
  // Reference calculation
 | 
			
		||||
  for (int j = 0; j < ny; ++j) {
 | 
			
		||||
    int jshift = (j + y_shift) % ny;
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < nx; ++i) {
 | 
			
		||||
      int ishift = (i + x_shift) % nx;
 | 
			
		||||
      gold[j * nx + i] = h_idata[jshift * nx + ishift];
 | 
			
		||||
    if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
 | 
			
		||||
        printf("nx and ny must be multiples of TILE_DIM\n");
 | 
			
		||||
        exit(EXIT_FAILURE);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Run ShiftPitchLinear kernel
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
 | 
			
		||||
    // Setup execution configuration parameters
 | 
			
		||||
    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(start, 0));
 | 
			
		||||
    // This will pick the best possible CUDA capable device
 | 
			
		||||
    int devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < NUM_REPS; ++i) {
 | 
			
		||||
    shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata,
 | 
			
		||||
                                            (int)(d_pitchBytes / sizeof(float)),
 | 
			
		||||
                                            nx, ny, x_shift, y_shift, texRefPL);
 | 
			
		||||
  }
 | 
			
		||||
    // CUDA events for timing
 | 
			
		||||
    cudaEvent_t start, stop;
 | 
			
		||||
    cudaEventCreate(&start);
 | 
			
		||||
    cudaEventCreate(&stop);
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(stop, 0));
 | 
			
		||||
  checkCudaErrors(cudaEventSynchronize(stop));
 | 
			
		||||
  float timePL;
 | 
			
		||||
  checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));
 | 
			
		||||
    // Host allocation and initialization
 | 
			
		||||
    float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
    float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
    float *gold    = (float *)malloc(sizeof(float) * nx * ny);
 | 
			
		||||
 | 
			
		||||
  // Check results
 | 
			
		||||
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
 | 
			
		||||
                               nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
 | 
			
		||||
    for (int i = 0; i < nx * ny; ++i) {
 | 
			
		||||
        h_idata[i] = (float)i;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
 | 
			
		||||
    // Device memory allocation
 | 
			
		||||
    // Pitch linear input data
 | 
			
		||||
    float *d_idataPL;
 | 
			
		||||
    size_t d_pitchBytes;
 | 
			
		||||
 | 
			
		||||
  bTestResult = true;
 | 
			
		||||
    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  if (res == false) {
 | 
			
		||||
    printf("*** shiftPitchLinear failed ***\n");
 | 
			
		||||
    bTestResult = false;
 | 
			
		||||
  }
 | 
			
		||||
    // Array input data
 | 
			
		||||
    cudaArray            *d_idataArray;
 | 
			
		||||
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 | 
			
		||||
 | 
			
		||||
  // Run ShiftArray kernel
 | 
			
		||||
  checkCudaErrors(
 | 
			
		||||
      cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(start, 0));
 | 
			
		||||
    checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < NUM_REPS; ++i) {
 | 
			
		||||
    shiftArray<<<dimGrid, dimBlock>>>(d_odata,
 | 
			
		||||
                                      (int)(d_pitchBytes / sizeof(float)), nx,
 | 
			
		||||
                                      ny, x_shift, y_shift, texRefArray);
 | 
			
		||||
  }
 | 
			
		||||
    // Pitch linear output data
 | 
			
		||||
    float *d_odata;
 | 
			
		||||
    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cudaEventRecord(stop, 0));
 | 
			
		||||
  checkCudaErrors(cudaEventSynchronize(stop));
 | 
			
		||||
  float timeArray;
 | 
			
		||||
  checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));
 | 
			
		||||
    // Copy host data to device
 | 
			
		||||
    // Pitch linear
 | 
			
		||||
    size_t h_pitchBytes = nx * sizeof(float);
 | 
			
		||||
 | 
			
		||||
  // Check results
 | 
			
		||||
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
 | 
			
		||||
                               nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
 | 
			
		||||
  res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
 | 
			
		||||
    checkCudaErrors(
 | 
			
		||||
        cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  if (res == false) {
 | 
			
		||||
    printf("*** shiftArray failed ***\n");
 | 
			
		||||
    bTestResult = false;
 | 
			
		||||
  }
 | 
			
		||||
    // Array
 | 
			
		||||
    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));
 | 
			
		||||
 | 
			
		||||
  float bandwidthPL =
 | 
			
		||||
      2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
 | 
			
		||||
  float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
 | 
			
		||||
                         (timeArray / NUM_REPS);
 | 
			
		||||
    cudaTextureObject_t texRefPL;
 | 
			
		||||
    cudaTextureObject_t texRefArray;
 | 
			
		||||
    cudaResourceDesc    texRes;
 | 
			
		||||
    memset(&texRes, 0, sizeof(cudaResourceDesc));
 | 
			
		||||
 | 
			
		||||
  printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
 | 
			
		||||
         bandwidthPL, bandwidthArray);
 | 
			
		||||
    texRes.resType                  = cudaResourceTypePitch2D;
 | 
			
		||||
    texRes.res.pitch2D.devPtr       = d_idataPL;
 | 
			
		||||
    texRes.res.pitch2D.desc         = channelDesc;
 | 
			
		||||
    texRes.res.pitch2D.width        = nx;
 | 
			
		||||
    texRes.res.pitch2D.height       = ny;
 | 
			
		||||
    texRes.res.pitch2D.pitchInBytes = h_pitchBytes;
 | 
			
		||||
    cudaTextureDesc texDescr;
 | 
			
		||||
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
 | 
			
		||||
 | 
			
		||||
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));

    memset(&texRes, 0, sizeof(cudaResourceDesc));
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texRes.resType            = cudaResourceTypeArray;
    texRes.res.array.array    = d_idataArray;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

    // Reference calculation
    for (int j = 0; j < ny; ++j) {
        int jshift = (j + y_shift) % ny;

        for (int i = 0; i < nx; ++i) {
            int ishift       = (i + x_shift) % nx;
            gold[j * nx + i] = h_idata[jshift * nx + ishift];
        }
    }

    // Run ShiftPitchLinear kernel
    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));

    checkCudaErrors(cudaEventRecord(start, 0));

    for (int i = 0; i < NUM_REPS; ++i) {
        shiftPitchLinear<<<dimGrid, dimBlock>>>(
            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    float timePL;
    checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

    // Check results
    checkCudaErrors(
        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));

    bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

    bTestResult = true;

    if (res == false) {
        printf("*** shiftPitchLinear failed ***\n");
        bTestResult = false;
    }

    // Run ShiftArray kernel
    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
    checkCudaErrors(cudaEventRecord(start, 0));

    for (int i = 0; i < NUM_REPS; ++i) {
        shiftArray<<<dimGrid, dimBlock>>>(
            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    float timeArray;
    checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

    // Check results
    checkCudaErrors(
        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
    res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

    if (res == false) {
        printf("*** shiftArray failed ***\n");
        bTestResult = false;
    }
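
    // Each shift reads and writes every float once (hence the factor of 2);
    // 1000.f converts the millisecond event timings to seconds and 1.e+9f
    // converts bytes to gigabytes.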
    float bandwidthPL    = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
    float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);

    printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);

    float fetchRatePL    = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
    float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));

    printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
           "%.2e; for array: %.2e\n\n",
           fetchRatePL,
           fetchRateArray);

    // Cleanup
    free(h_idata);
    free(h_odata);
    free(gold);

    checkCudaErrors(cudaDestroyTextureObject(texRefPL));
    checkCudaErrors(cudaDestroyTextureObject(texRefArray));
    checkCudaErrors(cudaFree(d_idataPL));
    checkCudaErrors(cudaFreeArray(d_idataArray));
    checkCudaErrors(cudaFree(d_odata));

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
}
@@ -26,48 +26,49 @@
 */

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

__global__ void testKernel(int val)
{
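    // The first two arguments flatten the 2-D block index and the 3-D thread
    // index into single scalar IDs for printing.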
    printf("[%d, %d]:\t\tValue is:%d\n",
 | 
			
		||||
           blockIdx.y * gridDim.x + blockIdx.x,
 | 
			
		||||
           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
 | 
			
		||||
           val);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  int devID;
 | 
			
		||||
  cudaDeviceProp props;
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    int            devID;
 | 
			
		||||
    cudaDeviceProp props;
 | 
			
		||||
 | 
			
		||||
  // This will pick the best possible CUDA capable device
 | 
			
		||||
  devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
    // This will pick the best possible CUDA capable device
 | 
			
		||||
    devID = findCudaDevice(argc, (const char **)argv);
 | 
			
		||||
 | 
			
		||||
  // Get GPU information
 | 
			
		||||
  checkCudaErrors(cudaGetDevice(&devID));
 | 
			
		||||
  checkCudaErrors(cudaGetDeviceProperties(&props, devID));
 | 
			
		||||
  printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
 | 
			
		||||
         props.major, props.minor);
 | 
			
		||||
    // Get GPU information
 | 
			
		||||
    checkCudaErrors(cudaGetDevice(&devID));
 | 
			
		||||
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
 | 
			
		||||
    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);
 | 
			
		||||
 | 
			
		||||
  printf("printf() is called. Output:\n\n");
 | 
			
		||||
    printf("printf() is called. Output:\n\n");
 | 
			
		||||
 | 
			
		||||
  // Kernel configuration, where a two-dimensional grid and
 | 
			
		||||
  // three-dimensional blocks are configured.
 | 
			
		||||
  dim3 dimGrid(2, 2);
 | 
			
		||||
  dim3 dimBlock(2, 2, 2);
 | 
			
		||||
  testKernel<<<dimGrid, dimBlock>>>(10);
 | 
			
		||||
  cudaDeviceSynchronize();
 | 
			
		||||
    // Kernel configuration, where a two-dimensional grid and
 | 
			
		||||
    // three-dimensional blocks are configured.
 | 
			
		||||
    dim3 dimGrid(2, 2);
 | 
			
		||||
    dim3 dimBlock(2, 2, 2);
 | 
			
		||||
    testKernel<<<dimGrid, dimBlock>>>(10);
 | 
			
		||||
    cudaDeviceSynchronize();
 | 
			
		||||
 | 
			
		||||
  return EXIT_SUCCESS;
 | 
			
		||||
    return EXIT_SUCCESS;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -44,141 +44,137 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};

const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
                                   "cudaDeviceScheduleSpin",
                                   "cudaDeviceScheduleYield",
                                   "INVALID",
                                   "cudaDeviceScheduleBlockingSync",
                                   NULL};

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap()
#endif

// Macro to align up to the memory size in question
#define MEMORY_ALIGNMENT  4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
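
// ALIGN_UP rounds the byte address x up to the next multiple of size; this
// relies on size being a power of two, as the 4096-byte page alignment is.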
__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 0; i < num_iterations; i++) {
        g_data[idx] += *factor; // non-coalesced on purpose, to burn time
    }
}

bool correct_data(int *a, const int n, const int c)
{
    for (int i = 0; i < n; i++) {
        if (a[i] != c) {
            printf("%d: %d %d\n", i, a[i], c);
            return false;
        }
    }

    return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    if (bPinGenericMemory) {
// allocate a generic page-aligned chunk of system memory
#ifdef WIN32
        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
               "system memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
               "memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif

        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

        printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
               "system memory\n",
               (float)nbytes / 1048576.0f);
        // pin the allocated memory
        checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
    }
    else
#endif
#endif
    {
        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
        // allocate host memory (pinned memory is required to achieve asynchronicity)
        checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
        *ppAligned_a = *pp_a;
    }
}

inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    // CUDA 4.0 supports pinning of generic host memory
    if (bPinGenericMemory) {
        // unpin and delete host memory
        checkCudaErrors(cudaHostUnregister(*ppAligned_a));
#ifdef WIN32
        VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
        munmap(*pp_a, nbytes);
#endif
    }
    else
#endif
#endif
    {
        cudaFreeHost(*pp_a);
    }
}

static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
                                    "1 (Spin Blocking)",
                                    "2 (Yield Blocking)",
                                    "3 (Undefined Blocking Method)",
                                    "4 (Blocking Sync Event) = low CPU utilization",
                                    NULL};

void printHelp()
{
    printf("Usage: %s [options below]\n", sSDKsample);
    printf("\t--sync_method=n for CPU/GPU synchronization\n");
    printf("\t             n=%s\n", sSyncMethod[0]);
    printf("\t             n=%s\n", sSyncMethod[1]);
    printf("\t             n=%s\n", sSyncMethod[2]);
    printf("\t   <Default> n=%s\n", sSyncMethod[4]);
    printf("\t--use_generic_memory (default) use generic page-aligned system "
           "memory\n");
    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
           "system memory\n");
}

#if defined(__APPLE__) || defined(MACOSX)
@@ -187,259 +183,240 @@ void printHelp() {
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif

int main(int argc, char **argv)
{
    int   cuda_device = 0;
    int   nstreams    = 4;                        // number of streams for CUDA calls
    int   nreps       = 10;                       // number of times each experiment is repeated
    int   n           = 16 * 1024 * 1024;         // number of ints in the data set
    int   nbytes      = n * sizeof(int);          // number of data bytes
    dim3  threads, blocks;                        // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel; // timing variables
    float scale_factor = 1.0f;

    // allocate generic memory and pin it later instead of using cudaHostAlloc()
    bool bPinGenericMemory  = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
    int  device_sync_method = cudaDeviceBlockingSync;        // by default we use BlockingSync

    int niterations; // number of iterations for the loop inside the kernel

    printf("[ %s ]\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printHelp();
        return EXIT_SUCCESS;
    }

    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
            printf("Setting reps to 100 to demonstrate steady state\n");
            nreps = 100;
        }
        else {
            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
            return EXIT_FAILURE;
        }
    }
    else {
        printHelp();
        return EXIT_SUCCESS;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
        bPinGenericMemory = false; // Generic Pinning of System Paged memory not
                                   // currently supported on Mac OSX
#else
        bPinGenericMemory = true;
#endif
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) {
        bPinGenericMemory = false;
    }

    printf("\n> ");
    cuda_device = findCudaDevice(argc, (const char **)argv);

    // check the compute capability of the device
    int num_devices = 0;
    checkCudaErrors(cudaGetDeviceCount(&num_devices));

    if (0 == num_devices) {
        printf("your system does not have a CUDA capable device, waiving test...\n");
        return EXIT_WAIVED;
    }

    // check if the command-line chosen device ID is within range, exit if not
    if (cuda_device >= num_devices) {
        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
        return EXIT_FAILURE;
    }

    checkCudaErrors(cudaSetDevice(cuda_device));

    // Checking for compute capabilities
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    niterations = 5;

    // Check if GPU can map host memory (Generic Method); if not, we override
    // bPinGenericMemory to be false
    if (bPinGenericMemory) {
        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

        if (deviceProp.canMapHostMemory == 0) {
            printf("Using cudaMallocHost, CUDA device does not support mapping of "
                   "generic host memory\n");
            bPinGenericMemory = false;
        }
    }

    // Anything with less than 32 cores will have a scaled-down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    n = (int)rint((float)n / scale_factor);

    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
    printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
    printf("> array_size   = %d\n\n", n);

    // enable use of blocking sync, to reduce CPU usage
    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
    checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

    // allocate host memory
    int  c          = 5; // value to which the array will be initialized
    int *h_a        = 0; // pointer to the array data in host memory
    int *hAligned_a = 0; // pointer to the array data in host memory (aligned to
                         // MEMORY_ALIGNMENT)

    // Allocate host memory (could use cudaMallocHost or VirtualAlloc/mmap if
    // using the new CUDA 4.0 features)
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    // allocate device memory
    int *d_a = 0,
        *d_c = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
    checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

    printf("\nStarting Test\n");

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    // use blocking sync
    cudaEvent_t start_event, stop_event;
    int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);
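
    // With cudaEventBlockingSync, cudaEventSynchronize() sleeps the calling
    // host thread instead of spin-waiting, which is what gives the low CPU
    // utilization this sync method advertises.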
    checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
    checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

    // time memcopy from device
    checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to
                                                      // ensure that all previous
                                                      // CUDA calls have
                                                      // completed
    checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
    checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("memcopy:\t%.2f\n", time_memcpy);

    // time kernel
    threads = dim3(512, 1);
    blocks  = dim3(n / threads.x, 1);
    checkCudaErrors(cudaEventRecord(start_event, 0));
    init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
    printf("kernel:\t\t%.2f\n", time_kernel);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads = dim3(512, 1);
    blocks  = dim3(n / threads.x, 1);
    checkCudaErrors(cudaEventRecord(start_event, 0));

    for (int k = 0; k < nreps; k++) {
        init_array<<<blocks, threads>>>(d_a, d_c, niterations);
        checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
    printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads = dim3(512, 1);
    blocks  = dim3(n / (nstreams * threads.x), 1);
    memset(hAligned_a, 255,
           nbytes);                              // set host memory bits to all 1s, for testing correctness
    checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
    checkCudaErrors(cudaEventRecord(start_event, 0));
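
    // Work is issued breadth-first: all nstreams kernels first, then all
    // nstreams copies, so on hardware with copy/compute overlap the copy in
    // one stream can run concurrently with kernels still executing in others.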
    for (int k = 0; k < nreps; k++) {
        // asynchronously launch nstreams kernels, each operating on its own portion
        // of data
        for (int i = 0; i < nstreams; i++) {
            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
        }

        // asynchronously launch nstreams memcopies.  Note that the memcopy in
        // stream x will only commence executing when all previous CUDA calls
        // in stream x have completed
        for (int i = 0; i < nstreams; i++) {
            checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
                                            d_a + i * n / nstreams,
                                            nbytes / nstreams,
                                            cudaMemcpyDeviceToHost,
                                            streams[i]));
        }
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
    printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
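
    // After the streamed section, d_a starts from zero and each of the nreps
    // passes adds *d_c (= c) to every element niterations times, so the
    // expected value per element is c * nreps * niterations.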
    // check whether the output is correct
    printf("-------------------------------\n");
    bool bResults = correct_data(hAligned_a, n, c * nreps * niterations);

    // release resources
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }

    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));

    // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
    FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFree(d_c));

    return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}
@@ -34,10 +34,10 @@
 */

// Includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -49,18 +49,18 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

#define MIN_EPSILON_ERROR 5e-3f

////////////////////////////////////////////////////////////////////////////////
// Define the files that are to be saved and the reference images for validation
const char *imageFilename = "teapot512.pgm";
const char *refFilename   = "ref_rotated.pgm";
float       angle         = 0.5f; // angle to rotate image by (in radians)

// Auto-Verification Code
bool testResult = true;
@@ -73,223 +73,218 @@ static const char *sampleName = "simpleSurfaceWrite";
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    // calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
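
    // surf2Dwrite addresses the surface in bytes along x, hence the x * 4
    // (sizeof(float)) scaling below.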
    // read from global memory and write to cuarray (via surface reference)
    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
}

////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param gOData  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / (float)width;
    float v = y / (float)height;

    // transform coordinates
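    // (the 0.5f offsets make the rotation pivot about the image center
    // rather than the top-left corner)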
    u -= 0.5f;
 | 
			
		||||
    v -= 0.5f;
 | 
			
		||||
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
 | 
			
		||||
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
 | 
			
		||||
 | 
			
		||||
  // read from texture and write to global memory
 | 
			
		||||
  gOData[y * width + x] = tex2D<float>(tex, tu, tv);
 | 
			
		||||
    // read from texture and write to global memory
 | 
			
		||||
    gOData[y * width + x] = tex2D<float>(tex, tu, tv);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Declaration, forward
 | 
			
		||||
void runTest(int argc, char **argv);
 | 
			
		||||
 | 
			
		||||
extern "C" void computeGold(float *reference, float *idata,
 | 
			
		||||
                            const unsigned int len);
 | 
			
		||||
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Program main
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
  printf("%s starting...\n", sampleName);
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
    printf("%s starting...\n", sampleName);
 | 
			
		||||
 | 
			
		||||
  // Process command-line arguments
 | 
			
		||||
  if (argc > 1) {
 | 
			
		||||
    if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
 | 
			
		||||
      getCmdLineArgumentString(argc, (const char **)argv, "input",
 | 
			
		||||
                               (char **)&imageFilename);
 | 
			
		||||
    // Process command-line arguments
 | 
			
		||||
    if (argc > 1) {
 | 
			
		||||
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
 | 
			
		||||
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);
 | 
			
		||||
 | 
			
		||||
      if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
        getCmdLineArgumentString(argc, (const char **)argv, "reference",
 | 
			
		||||
                                 (char **)&refFilename);
 | 
			
		||||
      } else {
 | 
			
		||||
        printf("-input flag should be used with -reference flag");
 | 
			
		||||
        exit(EXIT_FAILURE);
 | 
			
		||||
      }
 | 
			
		||||
    } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
      printf("-reference flag should be used with -input flag");
 | 
			
		||||
      exit(EXIT_FAILURE);
 | 
			
		||||
            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
 | 
			
		||||
            }
 | 
			
		||||
            else {
 | 
			
		||||
                printf("-input flag should be used with -reference flag");
 | 
			
		||||
                exit(EXIT_FAILURE);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
 | 
			
		||||
            printf("-reference flag should be used with -input flag");
 | 
			
		||||
            exit(EXIT_FAILURE);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  runTest(argc, argv);
 | 
			
		||||
    runTest(argc, argv);
 | 
			
		||||
 | 
			
		||||
  printf("%s completed, returned %s\n", sampleName,
 | 
			
		||||
         testResult ? "OK" : "ERROR!");
 | 
			
		||||
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
 | 
			
		||||
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  // Use command-line specified CUDA device,
  // otherwise use device with highest Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);
void runTest(int argc, char **argv)
{
    // Use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

  // Get number of SMs on this GPU
  cudaDeviceProp deviceProps;
    // Get number of SMs on this GPU
    cudaDeviceProp deviceProps;

  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
         deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major,
         deviceProps.minor);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
           deviceProps.name,
           deviceProps.multiProcessorCount,
           deviceProps.major,
           deviceProps.minor);

  // Load image from disk
  float *hData = NULL;
  unsigned int width, height;
  char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
    // Load image from disk
    float       *hData = NULL;
    unsigned int width, height;
    char        *imagePath = sdkFindFilePath(imageFilename, argv[0]);

  if (imagePath == NULL) {
    printf("Unable to source image input file: %s\n", imageFilename);
    exit(EXIT_FAILURE);
  }
    if (imagePath == NULL) {
        printf("Unable to source image input file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(imagePath, &hData, &width, &height);
    sdkLoadPGM(imagePath, &hData, &width, &height);

  unsigned int size = width * height * sizeof(float);
  printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

  // Load reference image from image (output)
  float *hDataRef = (float *)malloc(size);
  char *refPath = sdkFindFilePath(refFilename, argv[0]);
    // Load reference image from image (output)
    float *hDataRef = (float *)malloc(size);
    char  *refPath  = sdkFindFilePath(refFilename, argv[0]);

  if (refPath == NULL) {
    printf("Unable to find reference image file: %s\n", refFilename);
    exit(EXIT_FAILURE);
  }
    if (refPath == NULL) {
        printf("Unable to find reference image file: %s\n", refFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(refPath, &hDataRef, &width, &height);
    sdkLoadPGM(refPath, &hDataRef, &width, &height);

  // Allocate device memory for result
  float *dData = NULL;
  checkCudaErrors(cudaMalloc((void **)&dData, size));
    // Allocate device memory for result
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **)&dData, size));

  // Allocate array and copy image data
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cuArray;
  checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
                                  cudaArraySurfaceLoadStore));
    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

  cudaSurfaceObject_t outputSurface;
  cudaResourceDesc surfRes;
  memset(&surfRes, 0, sizeof(cudaResourceDesc));
  surfRes.resType = cudaResourceTypeArray;
  surfRes.res.array.array = cuArray;
    cudaSurfaceObject_t outputSurface;
    cudaResourceDesc    surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType         = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray;

  checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
#if 1
  checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
  surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
                                            outputSurface);
#else  // This is what differs from the example simpleTexture
  checkCudaErrors(
      cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
#else // This is what differs from the example simpleTexture
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
#endif

  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cuArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cuArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModeLinear;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  // Warmup
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Warmup
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaDeviceSynchronize());

  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

  // Execute the kernel
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  // Check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // Check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

  cudaDeviceSynchronize();
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  printf("%.2f Mpixels/sec\n",
         (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
  sdkDeleteTimer(&timer);
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

  // Allocate mem for the result on host side
  float *hOData = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));
    // Allocate mem for the result on host side
    float *hOData = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));

  // Write result to file
  char outputFilename[1024];
  strcpy(outputFilename, "output.pgm");
  sdkSavePGM("output.pgm", hOData, width, height);
  printf("Wrote '%s'\n", outputFilename);
    // Write result to file
    char outputFilename[1024];
    strcpy(outputFilename, "output.pgm");
    sdkSavePGM("output.pgm", hOData, width, height);
    printf("Wrote '%s'\n", outputFilename);

  // Write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // Write file for regression test
    sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f,
                        false);
  } else {
    // We need to reload the data from disk,
    // because it is inverted upon output
    sdkLoadPGM(outputFilename, &hOData, &width, &height);
    // Write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // Write file for regression test
        sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk,
        // because it is inverted upon output
        sdkLoadPGM(outputFilename, &hOData, &width, &height);

    printf("Comparing files\n");
    printf("\toutput:    <%s>\n", outputFilename);
    printf("\treference: <%s>\n", refPath);
    testResult =
        compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
  }
        printf("Comparing files\n");
        printf("\toutput:    <%s>\n", outputFilename);
        printf("\treference: <%s>\n", refPath);
        testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
    }

  checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(dData));
  checkCudaErrors(cudaFreeArray(cuArray));
  free(imagePath);
  free(refPath);
    checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(dData));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(imagePath);
    free(refPath);
}
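
// For reference, a minimal sketch of surfaceWriteKernel, which runTest above
// launches but which is defined elsewhere in this sample (the body below is an
// assumption based on the usual simpleSurfaceWrite layout, not part of this
// diff): it copies each pixel of the linear device buffer into the surface,
// noting that surf2Dwrite takes its x offset in bytes, not elements.
__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // byte offset in x; row index in y
    surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(float), y);
}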

@@ -68,106 +68,118 @@
// this
// struct by putting an undefined symbol in the function body so it won't
// compile.
template <typename T>
struct SharedMemory {
  // Ensure that we won't compile any un-specialized types
  __device__ T *getPointer() {
    extern __device__ void error(void);
    error();
    return NULL;
  }
template <typename T> struct SharedMemory
{
    // Ensure that we won't compile any un-specialized types
    __device__ T *getPointer()
    {
        extern __device__ void error(void);
        error();
        return NULL;
    }
};

// Following are the specializations for the following types.
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
// One could also specialize it for user-defined types.

template <>
struct SharedMemory<int> {
  __device__ int *getPointer() {
    extern __shared__ int s_int[];
    return s_int;
  }
template <> struct SharedMemory<int>
{
    __device__ int *getPointer()
    {
        extern __shared__ int s_int[];
        return s_int;
    }
};

template <>
struct SharedMemory<unsigned int> {
  __device__ unsigned int *getPointer() {
    extern __shared__ unsigned int s_uint[];
    return s_uint;
  }
template <> struct SharedMemory<unsigned int>
{
    __device__ unsigned int *getPointer()
    {
        extern __shared__ unsigned int s_uint[];
        return s_uint;
    }
};

template <>
struct SharedMemory<char> {
  __device__ char *getPointer() {
    extern __shared__ char s_char[];
    return s_char;
  }
template <> struct SharedMemory<char>
{
    __device__ char *getPointer()
    {
        extern __shared__ char s_char[];
        return s_char;
    }
};

template <>
struct SharedMemory<unsigned char> {
  __device__ unsigned char *getPointer() {
    extern __shared__ unsigned char s_uchar[];
    return s_uchar;
  }
template <> struct SharedMemory<unsigned char>
{
    __device__ unsigned char *getPointer()
    {
        extern __shared__ unsigned char s_uchar[];
        return s_uchar;
    }
};

template <>
struct SharedMemory<short> {
  __device__ short *getPointer() {
    extern __shared__ short s_short[];
    return s_short;
  }
template <> struct SharedMemory<short>
{
    __device__ short *getPointer()
    {
        extern __shared__ short s_short[];
        return s_short;
    }
};

template <>
struct SharedMemory<unsigned short> {
  __device__ unsigned short *getPointer() {
    extern __shared__ unsigned short s_ushort[];
    return s_ushort;
  }
template <> struct SharedMemory<unsigned short>
{
    __device__ unsigned short *getPointer()
    {
        extern __shared__ unsigned short s_ushort[];
        return s_ushort;
    }
};

template <>
struct SharedMemory<long> {
  __device__ long *getPointer() {
    extern __shared__ long s_long[];
    return s_long;
  }
template <> struct SharedMemory<long>
{
    __device__ long *getPointer()
    {
        extern __shared__ long s_long[];
        return s_long;
    }
};

template <>
struct SharedMemory<unsigned long> {
  __device__ unsigned long *getPointer() {
    extern __shared__ unsigned long s_ulong[];
    return s_ulong;
  }
template <> struct SharedMemory<unsigned long>
{
    __device__ unsigned long *getPointer()
    {
        extern __shared__ unsigned long s_ulong[];
        return s_ulong;
    }
};

template <>
struct SharedMemory<bool> {
  __device__ bool *getPointer() {
    extern __shared__ bool s_bool[];
    return s_bool;
  }
template <> struct SharedMemory<bool>
{
    __device__ bool *getPointer()
    {
        extern __shared__ bool s_bool[];
        return s_bool;
    }
};

template <>
struct SharedMemory<float> {
  __device__ float *getPointer() {
    extern __shared__ float s_float[];
    return s_float;
  }
template <> struct SharedMemory<float>
{
    __device__ float *getPointer()
    {
        extern __shared__ float s_float[];
        return s_float;
    }
};

template <>
struct SharedMemory<double> {
  __device__ double *getPointer() {
    extern __shared__ double s_double[];
    return s_double;
  }
template <> struct SharedMemory<double>
{
    __device__ double *getPointer()
    {
        extern __shared__ double s_double[];
        return s_double;
    }
};

#endif  //_SHAREDMEM_H_
#endif //_SHAREDMEM_H_
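
// For context, a minimal sketch of why the wrapper above exists (the kernel
// below is hypothetical, not from this header): writing
// `extern __shared__ T sdata[];` directly inside a template would emit the
// same symbol with conflicting types for different T, which nvcc rejects.
// Each specialization above instead names a distinct extern array
// (s_int, s_float, ...), and SharedMemory<T> selects the right one:
template <typename T>
__global__ void scaleByTwo(T *data)
{
    SharedMemory<T> shared;
    T *tile = shared.getPointer(); // resolves to the per-type extern array

    // stage through dynamically sized shared memory, then write back doubled
    tile[threadIdx.x] = data[threadIdx.x];
    __syncthreads();
    data[threadIdx.x] = tile[threadIdx.x] * (T)2;
}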

@@ -26,23 +26,23 @@
 */

/* This sample is a templatized version of the template project.
* It also shows how to correctly templatize dynamically allocated shared
* memory arrays.
* Host code.
*/
 * It also shows how to correctly templatize dynamically allocated shared
 * memory arrays.
 * Host code.
 */

// System includes
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
@@ -58,55 +58,55 @@ int g_TotalFailures = 0;
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
template <class T>
__global__ void testKernel(T *g_idata, T *g_odata) {
  // Shared mem size is determined by the host app at run time
  SharedMemory<T> smem;
  T *sdata = smem.getPointer();
template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
    // Shared mem size is determined by the host app at run time
    SharedMemory<T> smem;
    T              *sdata = smem.getPointer();

  // access thread id
  const unsigned int tid = threadIdx.x;
  // access number of threads in this block
  const unsigned int num_threads = blockDim.x;
    // access thread id
    const unsigned int tid = threadIdx.x;
    // access number of threads in this block
    const unsigned int num_threads = blockDim.x;

  // read in input data from global memory
  sdata[tid] = g_idata[tid];
  __syncthreads();
    // read in input data from global memory
    sdata[tid] = g_idata[tid];
    __syncthreads();

  // perform some computations
  sdata[tid] = (T)num_threads * sdata[tid];
  __syncthreads();
    // perform some computations
    sdata[tid] = (T)num_threads * sdata[tid];
    __syncthreads();

  // write data to global memory
  g_odata[tid] = sdata[tid];
    // write data to global memory
    g_odata[tid] = sdata[tid];
}

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
template <class T>
void runTest(int argc, char **argv, int len);
template <class T> void runTest(int argc, char **argv, int len);

template <class T>
void computeGold(T *reference, T *idata, const unsigned int len) {
  const T T_len = static_cast<T>(len);
template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len);

  for (unsigned int i = 0; i < len; ++i) {
    reference[i] = idata[i] * T_len;
  }
    for (unsigned int i = 0; i < len; ++i) {
        reference[i] = idata[i] * T_len;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("> runTest<float,32>\n");
  runTest<float>(argc, argv, 32);
  printf("> runTest<int,64>\n");
  runTest<int>(argc, argv, 64);
int main(int argc, char **argv)
{
    printf("> runTest<float,32>\n");
    runTest<float>(argc, argv, 32);
    printf("> runTest<int,64>\n");
    runTest<int>(argc, argv, 64);

  printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);
    printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);

  exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}

// To completely templatize runTest (below) with cutil, we need to use
@@ -114,151 +114,152 @@ int main(int argc, char **argv) {
// functions for different types.

// Here's the generic wrapper for cutCompare*
template <class T>
class ArrayComparator {
 public:
  bool compare(const T *reference, T *data, unsigned int len) {
    fprintf(stderr,
            "Error: no comparison function implemented for this type\n");
    return false;
  }
template <class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// Here's the specialization for ints:
template <>
class ArrayComparator<int> {
 public:
  bool compare(const int *reference, int *data, unsigned int len) {
    return compareData(reference, data, len, 0.15f, 0.0f);
  }
template <> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.0f);
    }
};

// Here's the specialization for floats:
template <>
class ArrayComparator<float> {
 public:
  bool compare(const float *reference, float *data, unsigned int len) {
    return compareData(reference, data, len, 0.15f, 0.15f);
  }
template <> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len)
    {
        return compareData(reference, data, len, 0.15f, 0.15f);
    }
};

// Here's the generic wrapper for cutWriteFile*
template <class T>
class ArrayFileWriter {
 public:
  bool write(const char *filename, T *data, unsigned int len, float epsilon) {
    fprintf(stderr,
            "Error: no file write function implemented for this type\n");
    return false;
  }
template <class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// Here's the specialization for ints:
template <>
class ArrayFileWriter<int> {
 public:
  bool write(const char *filename, int *data, unsigned int len, float epsilon) {
    return sdkWriteFile(filename, data, len, epsilon, false);
  }
template <> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

// Here's the specialization for floats:
template <>
class ArrayFileWriter<float> {
 public:
  bool write(const char *filename, float *data, unsigned int len,
             float epsilon) {
    return sdkWriteFile(filename, data, len, epsilon, false);
  }
template <> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon)
    {
        return sdkWriteFile(filename, data, len, epsilon, false);
    }
};

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
template <class T>
void runTest(int argc, char **argv, int len) {
  int devID;
  cudaDeviceProp deviceProps;
template <class T> void runTest(int argc, char **argv, int len)
{
    int            devID;
    cudaDeviceProp deviceProps;

  devID = findCudaDevice(argc, (const char **)argv);
    devID = findCudaDevice(argc, (const char **)argv);

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name,
         deviceProps.multiProcessorCount);
    // get number of SMs on this GPU
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

  // create and start timer
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

  // start the timer
  sdkStartTimer(&timer);
    // start the timer
    sdkStartTimer(&timer);

  unsigned int num_threads = len;
  unsigned int mem_size = sizeof(float) * num_threads;
    unsigned int num_threads = len;
    unsigned int mem_size    = sizeof(float) * num_threads;

  // allocate host memory
  T *h_idata = (T *)malloc(mem_size);
    // allocate host memory
    T *h_idata = (T *)malloc(mem_size);

  // initialize the memory
  for (unsigned int i = 0; i < num_threads; ++i) {
    h_idata[i] = (T)i;
  }
    // initialize the memory
    for (unsigned int i = 0; i < num_threads; ++i) {
        h_idata[i] = (T)i;
    }

  // allocate device memory
  T *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
  // copy host memory to device
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
    // allocate device memory
    T *d_idata;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

  // allocate device memory for result
  T *d_odata;
  checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
    // allocate device memory for result
    T *d_odata;
    checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));

  // setup execution parameters
  dim3 grid(1, 1, 1);
  dim3 threads(num_threads, 1, 1);
    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

  // execute the kernel
  testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
    // execute the kernel
    testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);

  // check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

  // allocate mem for the result on host side
  T *h_odata = (T *)malloc(mem_size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads,
                             cudaMemcpyDeviceToHost));
    // allocate mem for the result on host side
    T *h_odata = (T *)malloc(mem_size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));

  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  sdkDeleteTimer(&timer);
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

  // compute reference solution
  T *reference = (T *)malloc(mem_size);
  computeGold<T>(reference, h_idata, num_threads);
    // compute reference solution
    T *reference = (T *)malloc(mem_size);
    computeGold<T>(reference, h_idata, num_threads);

  ArrayComparator<T> comparator;
  ArrayFileWriter<T> writer;
    ArrayComparator<T> comparator;
    ArrayFileWriter<T> writer;

  // check result
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
    writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
  } else {
    // custom output handling when no regression test running
    // in this case check if the result is equivalent to the expected solution
    bool res = comparator.compare(reference, h_odata, num_threads);
    printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
    g_TotalFailures += (1 != res);
  }
    // check result
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
    }
    else {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bool res = comparator.compare(reference, h_odata, num_threads);
        printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
        g_TotalFailures += (1 != res);
    }

  // cleanup memory
  free(h_idata);
  free(h_odata);
  free(reference);
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));
    // cleanup memory
    free(h_idata);
    free(h_odata);
    free(reference);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
}
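
// A note on the launch above: the third execution-configuration argument in
// testKernel<T><<<grid, threads, mem_size>>> is the dynamic shared-memory
// size in bytes, and it is what backs the extern __shared__ array that
// SharedMemory<T>::getPointer() returns. It therefore has to cover
// num_threads elements of T; mem_size here is computed with sizeof(float),
// which the sample relies on being the same size as int for its two
// instantiations.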

@@ -34,10 +34,10 @@
 */

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -49,22 +49,22 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>  // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check

#define MAX_EPSILON_ERROR 5e-3f

// Define the files that are to be saved and the reference images for validation
const char *imageFilename = "teapot512.pgm";
const char *refFilename = "ref_rotated.pgm";
const char *refFilename   = "ref_rotated.pgm";

const char *sampleName = "simpleTexture";

////////////////////////////////////////////////////////////////////////////////
// Constants
const float angle = 0.5f;  // angle to rotate image by (in radians)
const float angle = 0.5f; // angle to rotate image by (in radians)

// Auto-Verification Code
bool testResult = true;
@@ -73,22 +73,22 @@ bool testResult = true;
//! Transform an image using texture lookups
//! @param outputData  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height,
                                float theta, cudaTextureObject_t tex) {
  // calculate normalized texture coordinates
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
__global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  float u = (float)x - (float)width / 2;
  float v = (float)y - (float)height / 2;
  float tu = u * cosf(theta) - v * sinf(theta);
  float tv = v * cosf(theta) + u * sinf(theta);
    float u  = (float)x - (float)width / 2;
    float v  = (float)y - (float)height / 2;
    float tu = u * cosf(theta) - v * sinf(theta);
    float tv = v * cosf(theta) + u * sinf(theta);

  tu /= (float)width;
  tv /= (float)height;
    tu /= (float)width;
    tv /= (float)height;

  // read from texture and write to global memory
  outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
    // read from texture and write to global memory
    outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
}
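
// How the kernel above maps pixels: it is a gather (inverse) mapping. Each
// output pixel centers its coordinates (u, v) on the image midpoint, applies
// the 2D rotation
//     tu = u*cos(theta) - v*sin(theta),  tv = v*cos(theta) + u*sin(theta),
// and samples the source texture at that rotated position, which avoids the
// holes a forward (scatter) mapping would leave. Dividing by width/height and
// adding 0.5f converts back to the [0, 1] range expected when
// normalizedCoords is set; cudaFilterModeLinear supplies bilinear
// interpolation at the non-integer sample points, and cudaAddressModeWrap
// handles coordinates that fall outside the image.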

////////////////////////////////////////////////////////////////////////////////
@@ -98,154 +98,151 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s starting...\n", sampleName);
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

  // Process command-line arguments
  if (argc > 1) {
    if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
      getCmdLineArgumentString(argc, (const char **)argv, "input",
                               (char **)&imageFilename);
    // Process command-line arguments
    if (argc > 1) {
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

      if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
        getCmdLineArgumentString(argc, (const char **)argv, "reference",
                                 (char **)&refFilename);
      } else {
        printf("-input flag should be used with -reference flag");
        exit(EXIT_FAILURE);
      }
    } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
      printf("-reference flag should be used with -input flag");
      exit(EXIT_FAILURE);
            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
            }
            else {
                printf("-input flag should be used with -reference flag");
                exit(EXIT_FAILURE);
            }
        }
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }
  }

  runTest(argc, argv);
    runTest(argc, argv);

  printf("%s completed, returned %s\n", sampleName,
         testResult ? "OK" : "ERROR!");
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  int devID = findCudaDevice(argc, (const char **)argv);
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

  // load image from disk
  float *hData = NULL;
  unsigned int width, height;
  char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
    // load image from disk
    float       *hData = NULL;
    unsigned int width, height;
    char        *imagePath = sdkFindFilePath(imageFilename, argv[0]);

  if (imagePath == NULL) {
    printf("Unable to source image file: %s\n", imageFilename);
    exit(EXIT_FAILURE);
  }
    if (imagePath == NULL) {
        printf("Unable to source image file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(imagePath, &hData, &width, &height);
    sdkLoadPGM(imagePath, &hData, &width, &height);

  unsigned int size = width * height * sizeof(float);
  printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

  // Load reference image from image (output)
  float *hDataRef = (float *)malloc(size);
  char *refPath = sdkFindFilePath(refFilename, argv[0]);
    // Load reference image from image (output)
    float *hDataRef = (float *)malloc(size);
    char  *refPath  = sdkFindFilePath(refFilename, argv[0]);

  if (refPath == NULL) {
    printf("Unable to find reference image file: %s\n", refFilename);
    exit(EXIT_FAILURE);
  }
    if (refPath == NULL) {
        printf("Unable to find reference image file: %s\n", refFilename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(refPath, &hDataRef, &width, &height);
    sdkLoadPGM(refPath, &hDataRef, &width, &height);

  // Allocate device memory for result
  float *dData = NULL;
  checkCudaErrors(cudaMalloc((void **)&dData, size));
    // Allocate device memory for result
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **)&dData, size));

  // Allocate array and copy image data
  cudaChannelFormatDesc channelDesc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray *cuArray;
  checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
  checkCudaErrors(
      cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));

  cudaTextureObject_t tex;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = cuArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cuArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModeLinear;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

  // Warmup
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Warmup
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  checkCudaErrors(cudaDeviceSynchronize());
  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

  // Execute the kernel
  transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
    // Execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);

  // Check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");
    // Check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  printf("%.2f Mpixels/sec\n",
         (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
  sdkDeleteTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

  // Allocate mem for the result on host side
  float *hOutputData = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost));
    // Allocate mem for the result on host side
    float *hOutputData = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost));

  // Write result to file
  char outputFilename[1024];
  strcpy(outputFilename, imagePath);
  strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm");
  sdkSavePGM(outputFilename, hOutputData, width, height);
  printf("Wrote '%s'\n", outputFilename);
    // Write result to file
    char outputFilename[1024];
    strcpy(outputFilename, imagePath);
    strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm");
    sdkSavePGM(outputFilename, hOutputData, width, height);
    printf("Wrote '%s'\n", outputFilename);

  // Write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // Write file for regression test
    sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height,
                        0.0f, false);
  } else {
    // We need to reload the data from disk,
    // because it is inverted upon output
    sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
    // Write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // Write file for regression test
        sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk,
        // because it is inverted upon output
        sdkLoadPGM(outputFilename, &hOutputData, &width, &height);

    printf("Comparing files\n");
    printf("\toutput:    <%s>\n", outputFilename);
    printf("\treference: <%s>\n", refPath);
        printf("Comparing files\n");
        printf("\toutput:    <%s>\n", outputFilename);
        printf("\treference: <%s>\n", refPath);

    testResult = compareData(hOutputData, hDataRef, width * height,
                             MAX_EPSILON_ERROR, 0.15f);
  }
        testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
    }

  checkCudaErrors(cudaDestroyTextureObject(tex));
  checkCudaErrors(cudaFree(dData));
  checkCudaErrors(cudaFreeArray(cuArray));
  free(imagePath);
  free(refPath);
    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(dData));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(imagePath);
    free(refPath);
}
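
// A porting note on the copy into cuArray above: cudaMemcpyToArray has been
// deprecated since CUDA 10.1. A minimal equivalent with the current API,
// assuming a tightly packed host buffer, would be:
//
//     checkCudaErrors(cudaMemcpy2DToArray(cuArray, 0, 0, hData,
//                                         width * sizeof(float), // source pitch
//                                         width * sizeof(float), // bytes per row
//                                         height, cudaMemcpyHostToDevice));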
 | 
			
		||||
 | 
			
		||||
@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 | 
			
		||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
 | 
			
		||||
 | 
			
		||||
## References (for more details)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -32,11 +32,11 @@
 | 
			
		||||
  using 3D texture lookups.
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#include <helper_gl.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
 | 
			
		||||
#if defined(__APPLE__) || defined(MACOSX)
 | 
			
		||||
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
 | 
			
		||||
@ -49,53 +49,52 @@
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// includes, cuda
 | 
			
		||||
#include <vector_types.h>
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
#include <cuda_gl_interop.h>
 | 
			
		||||
#include <cuda_runtime.h>
 | 
			
		||||
#include <vector_types.h>
 | 
			
		||||
 | 
			
		||||
// CUDA utilities and system includes
 | 
			
		||||
#include <helper_cuda.h>
 | 
			
		||||
#include <helper_functions.h>
 | 
			
		||||
#include <vector_types.h>
 | 
			
		||||
 | 
			
		||||
typedef unsigned int uint;
 | 
			
		||||
typedef unsigned int  uint;
 | 
			
		||||
typedef unsigned char uchar;
 | 
			
		||||
 | 
			
		||||
#define MAX_EPSILON_ERROR 5.0f
 | 
			
		||||
#define THRESHOLD 0.15f
 | 
			
		||||
#define THRESHOLD         0.15f
 | 
			
		||||
 | 
			
		||||
const char *sSDKsample = "simpleTexture3D";
 | 
			
		||||
 | 
			
		||||
const char *volumeFilename = "Bucky.raw";
 | 
			
		||||
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
 | 
			
		||||
const char      *volumeFilename = "Bucky.raw";
 | 
			
		||||
const cudaExtent volumeSize     = make_cudaExtent(32, 32, 32);
 | 
			
		||||
 | 
			
		||||
const uint width = 512, height = 512;
 | 
			
		||||
const dim3 blockSize(16, 16, 1);
 | 
			
		||||
const dim3 gridSize(width / blockSize.x, height / blockSize.y);
 | 
			
		||||

float w = 0.5;  // texture coordinate in z
float w = 0.5; // texture coordinate in z

GLuint pbo;  // OpenGL pixel buffer object
struct cudaGraphicsResource
    *cuda_pbo_resource;  // CUDA Graphics Resource (to transfer PBO)
GLuint                       pbo;               // OpenGL pixel buffer object
struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)

bool linearFiltering = true;
bool animate = true;
bool animate         = true;

StopWatchInterface *timer = NULL;

uint *d_output = NULL;

// Auto-Verification Code
const int frameCheckNumber = 4;
int fpsCount = 0;  // FPS count for averaging
int fpsLimit = 1;  // FPS limit for sampling
int g_Index = 0;
unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0;
const int    frameCheckNumber  = 4;
int          fpsCount          = 0; // FPS count for averaging
int          fpsLimit          = 1; // FPS limit for sampling
int          g_Index           = 0;
unsigned int frameCount        = 0;
unsigned int g_TotalErrors     = 0;
volatile int g_GraphicsMapFlag = 0;

int *pArgc = NULL;
int   *pArgc = NULL;
char **pArgv = NULL;

#ifndef MAX
@@ -105,288 +104,294 @@ char **pArgv = NULL;
extern "C" void cleanup();
 | 
			
		||||
extern "C" void setTextureFilterMode(bool bLinearFilter);
 | 
			
		||||
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
 | 
			
		||||
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
 | 
			
		||||
                              uint imageW, uint imageH, float w);
 | 
			
		||||
extern void cleanupCuda();
 | 
			
		||||
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
 | 
			
		||||
extern void     cleanupCuda();
 | 
			
		||||
 | 
			
		||||
void loadVolumeData(char *exec_path);
 | 
			
		||||
 | 
			
		||||
void computeFPS() {
 | 
			
		||||
  frameCount++;
 | 
			
		||||
  fpsCount++;
 | 
			
		||||
void computeFPS()
 | 
			
		||||
{
 | 
			
		||||
    frameCount++;
 | 
			
		||||
    fpsCount++;
 | 
			
		||||
 | 
			
		||||
  if (fpsCount == fpsLimit) {
 | 
			
		||||
    char fps[256];
 | 
			
		||||
    float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
 | 
			
		||||
    sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);
 | 
			
		||||
    if (fpsCount == fpsLimit) {
 | 
			
		||||
        char  fps[256];
 | 
			
		||||
        float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
 | 
			
		||||
        sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);
 | 
			
		||||
 | 
			
		||||
    glutSetWindowTitle(fps);
 | 
			
		||||
    fpsCount = 0;
 | 
			
		||||
        glutSetWindowTitle(fps);
 | 
			
		||||
        fpsCount = 0;
 | 
			
		||||
 | 
			
		||||
    fpsLimit = ftoi(MAX(1.0f, ifps));
 | 
			
		||||
    sdkResetTimer(&timer);
 | 
			
		||||
  }
 | 
			
		||||
        fpsLimit = ftoi(MAX(1.0f, ifps));
 | 
			
		||||
        sdkResetTimer(&timer);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
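// Note: computeFPS() resets fpsLimit to the measured frame rate after each
// report, so sdkGetAverageTimerValue() is averaged over roughly one second of
// frames between window-title updates.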

// render image using CUDA
void render() {
  // map PBO to get CUDA device pointer
  g_GraphicsMapFlag++;
  checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
  size_t num_bytes;
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
      (void **)&d_output, &num_bytes, cuda_pbo_resource));
  // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);
void render()
{
    // map PBO to get CUDA device pointer
    g_GraphicsMapFlag++;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
    // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

  // call CUDA kernel, writing results to PBO
  render_kernel(gridSize, blockSize, d_output, width, height, w);
    // call CUDA kernel, writing results to PBO
    render_kernel(gridSize, blockSize, d_output, width, height, w);

  getLastCudaError("render_kernel failed");
    getLastCudaError("render_kernel failed");

  if (g_GraphicsMapFlag) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
  }
    if (g_GraphicsMapFlag) {
        checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
        g_GraphicsMapFlag--;
    }
}
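// Note: each cudaGraphicsMapResources() call must be balanced by a
// cudaGraphicsUnmapResources() call before OpenGL touches the PBO again; the
// mapped device pointer is only valid between the two. g_GraphicsMapFlag counts
// the outstanding mapping so cleanup() can unmap safely on early exit.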

// display results using OpenGL (called by GLUT)
void display() {
  sdkStartTimer(&timer);
void display()
{
    sdkStartTimer(&timer);

  render();
    render();

  // display results
  glClear(GL_COLOR_BUFFER_BIT);
    // display results
    glClear(GL_COLOR_BUFFER_BIT);

  // draw image from PBO
  glDisable(GL_DEPTH_TEST);
  glRasterPos2i(0, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
  glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
    // draw image from PBO
    glDisable(GL_DEPTH_TEST);
    glRasterPos2i(0, 0);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

  glutSwapBuffers();
  glutReportErrors();
    glutSwapBuffers();
    glutReportErrors();

  sdkStopTimer(&timer);
  computeFPS();
    sdkStopTimer(&timer);
    computeFPS();
}

void idle() {
  if (animate) {
    w += 0.01f;
    glutPostRedisplay();
  }
void idle()
{
    if (animate) {
        w += 0.01f;
        glutPostRedisplay();
    }
}

void keyboard(unsigned char key, int x, int y) {
  switch (key) {
void keyboard(unsigned char key, int x, int y)
{
    switch (key) {
    case 27:
#if defined(__APPLE__) || defined(MACOSX)
      exit(EXIT_SUCCESS);
      glutDestroyWindow(glutGetWindow());
      return;
        exit(EXIT_SUCCESS);
        glutDestroyWindow(glutGetWindow());
        return;
#else
      glutDestroyWindow(glutGetWindow());
      return;
        glutDestroyWindow(glutGetWindow());
        return;
#endif

    case '=':
    case '+':
      w += 0.01f;
      break;
        w += 0.01f;
        break;

    case '-':
      w -= 0.01f;
      break;
        w -= 0.01f;
        break;

    case 'f':
      linearFiltering = !linearFiltering;
      setTextureFilterMode(linearFiltering);
      break;
        linearFiltering = !linearFiltering;
        setTextureFilterMode(linearFiltering);
        break;

    case ' ':
      animate = !animate;
      break;
        animate = !animate;
        break;

    default:
      break;
  }
        break;
    }

  glutPostRedisplay();
    glutPostRedisplay();
}

void reshape(int x, int y) {
  glViewport(0, 0, x, y);
void reshape(int x, int y)
{
    glViewport(0, 0, x, y);

  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}

void cleanup() {
  sdkDeleteTimer(&timer);
void cleanup()
{
    sdkDeleteTimer(&timer);

  // add extra check to unmap the resource before unregistering it
  if (g_GraphicsMapFlag) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
  }
    // add extra check to unmap the resource before unregistering it
    if (g_GraphicsMapFlag) {
        checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
        g_GraphicsMapFlag--;
    }

  // unregister this buffer object from CUDA C
  checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
  glDeleteBuffers(1, &pbo);
  cleanupCuda();
    // unregister this buffer object from CUDA C
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
    glDeleteBuffers(1, &pbo);
    cleanupCuda();
}

void initGLBuffers() {
  // create pixel buffer object
  glGenBuffers(1, &pbo);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
  glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4,
               0, GL_STREAM_DRAW_ARB);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
void initGLBuffers()
{
    // create pixel buffer object
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

  // register this buffer object with CUDA
  checkCudaErrors(cudaGraphicsGLRegisterBuffer(
      &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
    // register this buffer object with CUDA
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
}

// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size) {
  FILE *fp = fopen(filename, "rb");
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");

  if (!fp) {
    fprintf(stderr, "Error opening file '%s'\n", filename);
    return 0;
  }
    if (!fp) {
        fprintf(stderr, "Error opening file '%s'\n", filename);
        return 0;
    }

  uchar *data = (uchar *)malloc(size);
  size_t read = fread(data, 1, size, fp);
  fclose(fp);
    uchar *data = (uchar *)malloc(size);
    size_t read = fread(data, 1, size, fp);
    fclose(fp);

  printf("Read '%s', %zu bytes\n", filename, read);
    printf("Read '%s', %zu bytes\n", filename, read);

  return data;
    return data;
}

void initGL(int *argc, char **argv) {
  // initialize GLUT callback functions
  glutInit(argc, argv);
  glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
  glutInitWindowSize(width, height);
  glutCreateWindow("CUDA 3D texture");
  glutDisplayFunc(display);
  glutKeyboardFunc(keyboard);
  glutReshapeFunc(reshape);
  glutIdleFunc(idle);
void initGL(int *argc, char **argv)
{
    // initialize GLUT callback functions
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
    glutInitWindowSize(width, height);
    glutCreateWindow("CUDA 3D texture");
    glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);
    glutReshapeFunc(reshape);
    glutIdleFunc(idle);

  if (!isGLVersionSupported(2, 0) ||
      !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
    fprintf(stderr, "Required OpenGL extensions are missing.");
    exit(EXIT_FAILURE);
  }
    if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
        fprintf(stderr, "Required OpenGL extensions are missing.");
        exit(EXIT_FAILURE);
    }
}

void runAutoTest(const char *ref_file, char *exec_path) {
  checkCudaErrors(
      cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));
void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));

  // render the volumeData
  render_kernel(gridSize, blockSize, d_output, width, height, w);
    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("render_kernel failed");
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");

  void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
  checkCudaErrors(cudaMemcpy(h_output, d_output,
                             width * height * sizeof(GLubyte) * 4,
                             cudaMemcpyDeviceToHost));
  sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4,
             "simpleTexture3D.bin");
    void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");

  bool bTestResult = sdkCompareBin2BinFloat(
      "simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path),
      width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path);
    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
                                              sdkFindFilePath(ref_file, exec_path),
                                              width * height,
                                              MAX_EPSILON_ERROR,
                                              THRESHOLD,
                                              exec_path);

  checkCudaErrors(cudaFree(d_output));
  free(h_output);
    checkCudaErrors(cudaFree(d_output));
    free(h_output);

  sdkStopTimer(&timer);
  sdkDeleteTimer(&timer);
    sdkStopTimer(&timer);
    sdkDeleteTimer(&timer);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void loadVolumeData(char *exec_path) {
  // load volume data
  const char *path = sdkFindFilePath(volumeFilename, exec_path);
void loadVolumeData(char *exec_path)
{
    // load volume data
    const char *path = sdkFindFilePath(volumeFilename, exec_path);

  if (path == NULL) {
    fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n",
            volumeFilename);
    exit(EXIT_FAILURE);
  }
    if (path == NULL) {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

  size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;
  uchar *h_volume = loadRawFile(path, size);
    size_t size     = volumeSize.width * volumeSize.height * volumeSize.depth;
    uchar *h_volume = loadRawFile(path, size);

  initCuda(h_volume, volumeSize);
  sdkCreateTimer(&timer);
    initCuda(h_volume, volumeSize);
    sdkCreateTimer(&timer);

  free(h_volume);
    free(h_volume);
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

  char *ref_file = NULL;
    char *ref_file = NULL;

#if defined(__linux__)
  setenv("DISPLAY", ":0", 0);
    setenv("DISPLAY", ":0", 0);
#endif

  printf("%s Starting...\n\n", sSDKsample);
    printf("%s Starting...\n\n", sSDKsample);

  if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
    fpsLimit = frameCheckNumber;
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
  }
    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        fpsLimit = frameCheckNumber;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
    }

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  findCudaDevice(argc, (const char **)argv);
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    findCudaDevice(argc, (const char **)argv);

  if (ref_file) {
    loadVolumeData(argv[0]);
    runAutoTest(ref_file, argv[0]);
  } else {
    initGL(&argc, argv);
    if (ref_file) {
        loadVolumeData(argv[0]);
        runAutoTest(ref_file, argv[0]);
    }
    else {
        initGL(&argc, argv);

    // OpenGL buffers
    initGLBuffers();
        // OpenGL buffers
        initGLBuffers();

    loadVolumeData(argv[0]);
  }
        loadVolumeData(argv[0]);
    }

  printf(
      "Press space to toggle animation\n"
      "Press '+' and '-' to change displayed slice\n");
    printf("Press space to toggle animation\n"
           "Press '+' and '-' to change displayed slice\n");

#if defined(__APPLE__) || defined(MACOSX)
  atexit(cleanup);
    atexit(cleanup);
#else
  glutCloseFunc(cleanup);
    glutCloseFunc(cleanup);
#endif

  glutMainLoop();
    glutMainLoop();

  exit(EXIT_SUCCESS);
    exit(EXIT_SUCCESS);
}

@@ -28,111 +28,111 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <helper_cuda.h>
#include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned int uint;
typedef unsigned int  uint;
typedef unsigned char uchar;

cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex;  // 3D texture
cudaArray          *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture

__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
                         cudaTextureObject_t texObj) {
  uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
  uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

  float u = x / (float)imageW;
  float v = y / (float)imageH;
  // read from 3D texture
  float voxel = tex3D<float>(texObj, u, v, w);
    float u = x / (float)imageW;
    float v = y / (float)imageH;
    // read from 3D texture
    float voxel = tex3D<float>(texObj, u, v, w);

  if ((x < imageW) && (y < imageH)) {
    // write output color
    uint i = __umul24(y, imageW) + x;
    d_output[i] = voxel * 255;
  }
    if ((x < imageW) && (y < imageH)) {
        // write output color
        uint i      = __umul24(y, imageW) + x;
        d_output[i] = voxel * 255;
    }
}
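// Note: the texture is created with cudaReadModeNormalizedFloat, so tex3D
// returns the uchar voxel converted to a float in [0, 1] (interpolated when
// linear filtering is enabled); multiplying by 255 maps it back to a byte-range
// intensity for the output buffer. The bounds check keeps out-of-range threads
// from writing, even though they still perform the texture fetch.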

extern "C" void setTextureFilterMode(bool bLinearFilter) {
  if (tex) {
    checkCudaErrors(cudaDestroyTextureObject(tex));
  }
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
extern "C" void setTextureFilterMode(bool bLinearFilter)
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_volumeArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = d_volumeArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode =
      bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeNormalizedFloat;
    texDescr.normalizedCoords = true;
    texDescr.filterMode       = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
    texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap;
    texDescr.addressMode[2] = cudaAddressModeWrap;
    texDescr.readMode       = cudaReadModeNormalizedFloat;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}
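// Note: a texture object is immutable once created, so toggling the filter mode
// means destroying the old cudaTextureObject_t and creating a new one over the
// same underlying cudaArray, which is what setTextureFilterMode() does above.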

extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
  // create 3D array
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
  checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

  // copy data to 3D array
  cudaMemcpy3DParms copyParams = {0};
  copyParams.srcPtr =
      make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar),
                          volumeSize.width, volumeSize.height);
  copyParams.dstArray = d_volumeArray;
  copyParams.extent = volumeSize;
  copyParams.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&copyParams));
    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));

  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_volumeArray;
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = d_volumeArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));
    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

  // access with normalized texture coordinates
  texDescr.normalizedCoords = true;
  // linear interpolation
  texDescr.filterMode = cudaFilterModeLinear;
  // wrap texture coordinates
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeNormalizedFloat;
    // access with normalized texture coordinates
    texDescr.normalizedCoords = true;
    // linear interpolation
    texDescr.filterMode = cudaFilterModeLinear;
    // wrap texture coordinates
    texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap;
    texDescr.addressMode[2] = cudaAddressModeWrap;
    texDescr.readMode       = cudaReadModeNormalizedFloat;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}
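// Note: make_cudaPitchedPtr(ptr, pitch, xsize, ysize) describes the source of
// the cudaMemcpy3D: pitch is the row stride in bytes (tightly packed here at
// volumeSize.width * sizeof(uchar)), xsize the row width in elements, and ysize
// the number of rows per slice.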

extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
                              uint imageW, uint imageH, float w) {
  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda() {
  if (tex) {
    checkCudaErrors(cudaDestroyTextureObject(tex));
  }
  if (d_volumeArray) {
    checkCudaErrors(cudaFreeArray(d_volumeArray));
  }
void cleanupCuda()
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
    if (d_volumeArray) {
        checkCudaErrors(cudaFreeArray(d_volumeArray));
    }
}

#endif  // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_

@@ -26,29 +26,29 @@
 */

/*
* This sample demonstrates how to use texture fetches in CUDA
*
* This sample takes an input PGM image (image_filename) and generates
* an output PGM image (image_filename_out).  This CUDA kernel performs
* a simple 2D transform (rotation) on the texture coordinates (u,v).
* The results between simpleTexture and simpleTextureDrv are identical.
* The main difference is the implementation.  simpleTextureDrv makes calls
* to the CUDA driver API and demonstrates how to use cuModuleLoad to load
* the CUDA ptx (*.ptx) kernel just prior to kernel launch.
*
*/
 * This sample demonstrates how to use texture fetches in CUDA
 *
 * This sample takes an input PGM image (image_filename) and generates
 * an output PGM image (image_filename_out).  This CUDA kernel performs
 * a simple 2D transform (rotation) on the texture coordinates (u,v).
 * The results between simpleTexture and simpleTextureDrv are identical.
 * The main difference is the implementation.  simpleTextureDrv makes calls
 * to the CUDA driver API and demonstrates how to use cuModuleLoad to load
 * the CUDA ptx (*.ptx) kernel just prior to kernel launch.
 *
 */
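// In outline, the Driver API flow this file implements is (a sketch only;
// fatbinImage, gx/gy, bx/by and args stand in for values computed later in
// this file):
//
//     cuInit(0);
//     CUdevice dev;
//     cuDeviceGet(&dev, 0);
//     CUcontext ctx;
//     cuCtxCreate(&ctx, 0, dev);
//     CUmodule mod;
//     cuModuleLoadData(&mod, fatbinImage);
//     CUfunction fn;
//     cuModuleGetFunction(&fn, mod, "transformKernel");
//     cuLaunchKernel(fn, gx, gy, 1, bx, by, 1, 0, NULL, args, NULL);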

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include <cuda.h>
#include <builtin_types.h>
#include <cuda.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
@@ -56,8 +56,8 @@
using namespace std;

const char *image_filename = "teapot512.pgm";
const char *ref_filename = "ref_rotated.pgm";
float angle = 0.5f;  // angle to rotate image by (in radians)
const char *ref_filename   = "ref_rotated.pgm";
float       angle          = 0.5f; // angle to rotate image by (in radians)

#define MIN_EPSILON_ERROR 5e-3f

@@ -65,8 +65,7 @@ float angle = 0.5f;  // angle to rotate image by (in radians)
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata,
                            const unsigned int len);
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

static CUresult initCUDA(int argc, char **argv, CUfunction *);

@@ -80,212 +79,227 @@ const char *sSDKsample = "simpleTextureDrv (Driver API)";
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice;
CUdevice  cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUmodule  cuModule;

void showHelp() {
  printf("\n> [%s] Command line options\n", sSDKsample);
  printf("\t-device=n          (where n=0,1,2.... for the GPU device)\n\n");
void showHelp()
{
    printf("\n> [%s] Command line options\n", sSDKsample);
    printf("\t-device=n          (where n=0,1,2.... for the GPU device)\n\n");
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    showHelp();
    return 0;
  }
int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        showHelp();
        return 0;
    }

  runTest(argc, argv);
    runTest(argc, argv);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  bool bTestResults = true;
void runTest(int argc, char **argv)
{
    bool bTestResults = true;

  // initialize CUDA
  CUfunction transform = NULL;
    // initialize CUDA
    CUfunction transform = NULL;

  if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
    exit(EXIT_FAILURE);
  }
    if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
        exit(EXIT_FAILURE);
    }

  // load image from disk
  float *h_data = NULL;
  unsigned int width, height;
  char *image_path = sdkFindFilePath(image_filename, argv[0]);
    // load image from disk
    float       *h_data = NULL;
    unsigned int width, height;
    char        *image_path = sdkFindFilePath(image_filename, argv[0]);

  if (image_path == NULL) {
    printf("Unable to find image file: '%s'\n", image_filename);
    exit(EXIT_FAILURE);
  }
    if (image_path == NULL) {
        printf("Unable to find image file: '%s'\n", image_filename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(image_path, &h_data, &width, &height);
    sdkLoadPGM(image_path, &h_data, &width, &height);

  size_t size = width * height * sizeof(float);
  printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);
    size_t size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);

  // load reference image from image (output)
  float *h_data_ref = (float *)malloc(size);
  char *ref_path = sdkFindFilePath(ref_filename, argv[0]);
    // load reference image from image (output)
    float *h_data_ref = (float *)malloc(size);
    char  *ref_path   = sdkFindFilePath(ref_filename, argv[0]);

  if (ref_path == NULL) {
    printf("Unable to find reference file %s\n", ref_filename);
    exit(EXIT_FAILURE);
  }
    if (ref_path == NULL) {
        printf("Unable to find reference file %s\n", ref_filename);
        exit(EXIT_FAILURE);
    }

  sdkLoadPGM(ref_path, &h_data_ref, &width, &height);
    sdkLoadPGM(ref_path, &h_data_ref, &width, &height);

  // allocate device memory for result
  CUdeviceptr d_data = (CUdeviceptr)NULL;
  checkCudaErrors(cuMemAlloc(&d_data, size));
    // allocate device memory for result
    CUdeviceptr d_data = (CUdeviceptr)NULL;
    checkCudaErrors(cuMemAlloc(&d_data, size));

  // allocate array and copy image data
  CUarray cu_array;
  CUDA_ARRAY_DESCRIPTOR desc;
  desc.Format = CU_AD_FORMAT_FLOAT;
  desc.NumChannels = 1;
  desc.Width = width;
  desc.Height = height;
  checkCudaErrors(cuArrayCreate(&cu_array, &desc));
  CUDA_MEMCPY2D copyParam;
  memset(&copyParam, 0, sizeof(copyParam));
  copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
  copyParam.dstArray = cu_array;
  copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
  copyParam.srcHost = h_data;
  copyParam.srcPitch = width * sizeof(float);
  copyParam.WidthInBytes = copyParam.srcPitch;
  copyParam.Height = height;
  checkCudaErrors(cuMemcpy2D(&copyParam));
    // allocate array and copy image data
    CUarray               cu_array;
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format      = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width       = width;
    desc.Height      = height;
    checkCudaErrors(cuArrayCreate(&cu_array, &desc));
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    copyParam.dstArray      = cu_array;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost       = h_data;
    copyParam.srcPitch      = width * sizeof(float);
    copyParam.WidthInBytes  = copyParam.srcPitch;
    copyParam.Height        = height;
    checkCudaErrors(cuMemcpy2D(&copyParam));

  // set texture parameters
  CUtexObject TexObject;
  CUDA_RESOURCE_DESC ResDesc;
  memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
  ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
  ResDesc.res.array.hArray = cu_array;
    // set texture parameters
    CUtexObject        TexObject;
    CUDA_RESOURCE_DESC ResDesc;
    memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
    ResDesc.resType          = CU_RESOURCE_TYPE_ARRAY;
    ResDesc.res.array.hArray = cu_array;

  CUDA_TEXTURE_DESC TexDesc;
  memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
  TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
  TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
  TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
  TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
  TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
    CUDA_TEXTURE_DESC TexDesc;
    memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
    TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
    TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
    TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
    TexDesc.filterMode     = CU_TR_FILTER_MODE_LINEAR;
    TexDesc.flags          = CU_TRSF_NORMALIZED_COORDINATES;

  checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
    checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));

  // There are two ways to launch CUDA kernels via the Driver API.
  // In this CUDA Sample, we illustrate both ways to pass parameters
  // and specify parameters.  By default we use the simpler method.
  int block_size = 8;
  StopWatchInterface *timer = NULL;
    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters.  By default we use the simpler method.
    int                 block_size = 8;
    StopWatchInterface *timer      = NULL;

  if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
    void *args[5] = {&d_data, &width, &height, &angle, &TexObject};
    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};

        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is
        // storing the value of the parameters
        *((CUdeviceptr *)&argBuffer[offset]) = d_data;
        offset += sizeof(d_data);
        *((unsigned int *)&argBuffer[offset]) = width;
        offset += sizeof(width);
        *((unsigned int *)&argBuffer[offset]) = height;
        offset += sizeof(height);
        *((float *)&argBuffer[offset]) = angle;
        offset += sizeof(angle);
        *((CUtexObject *)&argBuffer[offset]) = TexObject;
        offset += sizeof(TexObject);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call (warmup)
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       NULL,
                                       NULL,
                                       (void **)&kernel_launch_config));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       0,
                                       NULL,
                                       (void **)&kernel_launch_config));
    }

    checkCudaErrors(cuLaunchKernel(transform, (width / block_size),
                                   (height / block_size), 1, block_size,
                                   block_size, 1, 0, NULL, args, NULL));
    checkCudaErrors(cuCtxSynchronize());
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // launch kernel again for performance measurement
    checkCudaErrors(cuLaunchKernel(transform, (width / block_size),
                                   (height / block_size), 1, block_size,
                                   block_size, 1, 0, NULL, args, NULL));
  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
    char argBuffer[256];
    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size));

    // pass in launch parameters (not actually de-referencing CUdeviceptr).
    // CUdeviceptr is
    // storing the value of the parameters
    *((CUdeviceptr *)&argBuffer[offset]) = d_data;
    offset += sizeof(d_data);
    *((unsigned int *)&argBuffer[offset]) = width;
    offset += sizeof(width);
    *((unsigned int *)&argBuffer[offset]) = height;
    offset += sizeof(height);
    *((float *)&argBuffer[offset]) = angle;
    offset += sizeof(angle);
    *((CUtexObject *)&argBuffer[offset]) = TexObject;
    offset += sizeof(TexObject);
    // write result to file
    char output_filename[1024];
    strcpy(output_filename, image_path);
    strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
    sdkSavePGM(output_filename, h_odata, width, height);
    printf("Wrote '%s'\n", output_filename);

    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                     CU_LAUNCH_PARAM_END};
    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk, because it is inverted upon output
        sdkLoadPGM(output_filename, &h_odata, &width, &height);

    // new CUDA 4.0 Driver API Kernel launch call (warmup)
    checkCudaErrors(cuLaunchKernel(
        transform, (width / block_size), (height / block_size), 1, block_size,
        block_size, 1, 0, NULL, NULL, (void **)&kernel_launch_config));
    checkCudaErrors(cuCtxSynchronize());
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
        printf("Comparing files\n");
        printf("\toutput:    <%s>\n", output_filename);
        printf("\treference: <%s>\n", ref_path);
        bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
    }

    // launch kernel again for performance measurement
    checkCudaErrors(cuLaunchKernel(
        transform, (width / block_size), (height / block_size), 1, block_size,
        block_size, 1, 0, 0, NULL, (void **)&kernel_launch_config));
  }
    // cleanup memory
    checkCudaErrors(cuTexObjectDestroy(TexObject));
    checkCudaErrors(cuMemFree(d_data));
    checkCudaErrors(cuArrayDestroy(cu_array));

  checkCudaErrors(cuCtxSynchronize());
  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  printf("%.2f Mpixels/sec\n",
         (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
  sdkDeleteTimer(&timer);
    free(image_path);
    free(ref_path);

  // allocate mem for the result on host side
  float *h_odata = (float *)malloc(size);
  // copy result from device to host
  checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size));
    checkCudaErrors(cuCtxDestroy(cuContext));

  // write result to file
  char output_filename[1024];
  strcpy(output_filename, image_path);
  strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
  sdkSavePGM(output_filename, h_odata, width, height);
  printf("Wrote '%s'\n", output_filename);

  // write regression file if necessary
  if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
    // write file for regression test
    sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f,
                        false);
  } else {
    // We need to reload the data from disk, because it is inverted upon output
    sdkLoadPGM(output_filename, &h_odata, &width, &height);

    printf("Comparing files\n");
    printf("\toutput:    <%s>\n", output_filename);
    printf("\treference: <%s>\n", ref_path);
    bTestResults = compareData(h_odata, h_data_ref, width * height,
                               MIN_EPSILON_ERROR, 0.15f);
  }

  // cleanup memory
  checkCudaErrors(cuTexObjectDestroy(TexObject));
  checkCudaErrors(cuMemFree(d_data));
  checkCudaErrors(cuArrayDestroy(cu_array));

  free(image_path);
  free(ref_path);

  checkCudaErrors(cuCtxDestroy(cuContext));

  exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
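// Note on the two launch paths above: the simpler method passes each kernel
// argument's address through cuLaunchKernel's kernelParams array and leaves the
// trailing "extra" argument NULL; the advanced method packs the argument values
// into argBuffer by hand and hands it over via CU_LAUNCH_PARAM_BUFFER_POINTER /
// CU_LAUNCH_PARAM_BUFFER_SIZE in "extra", in which case kernelParams must be
// NULL.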

////////////////////////////////////////////////////////////////////////////////
@@ -293,45 +307,44 @@ void runTest(int argc, char **argv) {
//! kernel function.  After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction"
////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
  CUfunction cuFunction = 0;
  int major = 0, minor = 0, devID = 0;
  char deviceName[100];
  string module_path;
static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0, devID = 0;
    char       deviceName[100];
    string     module_path;

  cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

  // get compute capabilities and the devicename
  checkCudaErrors(cuDeviceGetAttribute(
      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
  checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
  printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

  checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

  // first search for the module_path before we try to load the results
  std::ostringstream fatbin;
    // first search for the module_path before we try to load the results
    std::ostringstream fatbin;

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }
    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }
    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

  // Create module from binary file (FATBIN)
  checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

  checkCudaErrors(
      cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));
    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));

  *transform = cuFunction;
    *transform = cuFunction;

  return CUDA_SUCCESS;
    return CUDA_SUCCESS;
}
 | 
			
		||||
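For context: once cuModuleGetFunction has returned the CUfunction handle, the host launches it through cuLaunchKernel, passing the kernel arguments as an array of pointers. A minimal sketch under that assumption (the grid/block sizes and variable names here are illustrative, not taken from the sample):

    void *args[] = {&d_data, &width, &height, &angle, &TexObject};
    checkCudaErrors(cuLaunchKernel(cuFunction,
                                   width / 8, height / 8, 1, // grid dim
                                   8, 8, 1,                  // block dim
                                   0, NULL,                  // shared mem bytes, stream
                                   args, NULL));             // kernel params, extra
    checkCudaErrors(cuCtxSynchronize());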
@ -33,23 +33,22 @@
//! Transform an image using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width,
                                           int height, float theta,
                                           CUtexObject tex) {
  // calculate normalized texture coordinates
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  float u = (float)x - (float)width / 2;
  float v = (float)y - (float)height / 2;
  float tu = u * cosf(theta) - v * sinf(theta);
  float tv = v * cosf(theta) + u * sinf(theta);
    float u  = (float)x - (float)width / 2;
    float v  = (float)y - (float)height / 2;
    float tu = u * cosf(theta) - v * sinf(theta);
    float tv = v * cosf(theta) + u * sinf(theta);

  tu /= (float)width;
  tv /= (float)height;
    tu /= (float)width;
    tv /= (float)height;

  // read from texture and write to global memory
  g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
    // read from texture and write to global memory
    g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
}

#endif  // #ifndef _SIMPLETEXTURE_KERNEL_H_
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
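The kernel above samples through a CUtexObject with normalized coordinates. A sketch of how such an object can be built over a CUarray with the driver API (the descriptor settings below are plausible defaults, not necessarily the exact ones this sample uses):

    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType          = CU_RESOURCE_TYPE_ARRAY;
    resDesc.res.array.hArray = cu_array; // CUarray holding the image data

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
    texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
    texDesc.filterMode     = CU_TR_FILTER_MODE_LINEAR;
    texDesc.flags          = CU_TRSF_NORMALIZED_COORDINATES;

    CUtexObject TexObject;
    checkCudaErrors(cuTexObjectCreate(&TexObject, &resDesc, &texDesc, NULL));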
@ -53,257 +53,237 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh"

// Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) {
  // For testing VOTE.Any (all of these threads will return 0)
  for (int i = 0; i < size / 4; i++) {
    VOTE_PATTERN[i] = 0x00000000;
  }

  // For testing VOTE.Any (1/2 these threads will return 1)
  for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
    VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
  }

  // For testing VOTE.all (1/2 of these threads will return 0)
  for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
    VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
  }

  // For testing VOTE.all (all of these threads will return 1)
  for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
    VOTE_PATTERN[i] = 0xffffffff;
  }
}

int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
                 const char *voteType) {
  int i, sum = 0;

  for (sum = 0, i = start; i < end; i++) {
    sum += h_result[i];
  }

  if (sum > 0) {
    printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

    for (i = start; i < end; i++) {
      printf("%d", h_result[i]);
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    // For testing VOTE.Any (all of these threads will return 0)
    for (int i = 0; i < size / 4; i++) {
        VOTE_PATTERN[i] = 0x00000000;
    }

    printf("%d values FAILED\n", sum);
  }

  return (sum > 0);
}

int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
                 const char *voteType) {
  int i, sum = 0;

  for (sum = 0, i = start; i < end; i++) {
    sum += h_result[i];
  }

  if (sum != warp_size) {
    printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

    for (i = start; i < end; i++) {
      printf("%d", h_result[i]);
    // For testing VOTE.Any (1/2 these threads will return 1)
    for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
        VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
    }

    printf(" - FAILED\n");
  }
    // For testing VOTE.all (1/2 of these threads will return 0)
    for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
        VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
    }

  return (sum != warp_size);
    // For testing VOTE.all (all of these threads will return 1)
    for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
        VOTE_PATTERN[i] = 0xffffffff;
    }
}

int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
        sum += h_result[i];
    }

    if (sum > 0) {
        printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

        for (i = start; i < end; i++) {
            printf("%d", h_result[i]);
        }

        printf("%d values FAILED\n", sum);
    }

    return (sum > 0);
}

int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
        sum += h_result[i];
    }

    if (sum != warp_size) {
        printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);

        for (i = start; i < end; i++) {
            printf("%d", h_result[i]);
        }

        printf(" - FAILED\n");
    }

    return (sum != warp_size);
}

// Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size,
                               int warp_size) {
  int error_count = 0;
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

  error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4,
                              warp_size, "Vote.Any");
  error_count +=
      checkErrors2(h_result, VOTE_DATA_GROUP * warp_size / 4,
                   2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
  error_count +=
      checkErrors2(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4,
                   3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
  error_count +=
      checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
                   4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");

  printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
  return error_count;
    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size,
                               int warp_size) {
  int error_count = 0;
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

  error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4,
                              warp_size, "Vote.All");
  error_count +=
      checkErrors1(h_result, VOTE_DATA_GROUP * warp_size / 4,
                   2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
  error_count +=
      checkErrors1(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4,
                   3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
  error_count +=
      checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
                   4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");

  printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
  return error_count;
    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size) {
  int i, error_count = 0;
int checkResultsVoteAnyKernel3(bool *hinfo, int size)
{
    int i, error_count = 0;

  for (i = 0; i < size * 3; i++) {
    switch (i % 3) {
      case 0:
    for (i = 0; i < size * 3; i++) {
        switch (i % 3) {
        case 0:

        // First warp should be all zeros.
        if (hinfo[i] != (i >= size * 1)) {
          error_count++;
            // First warp should be all zeros.
            if (hinfo[i] != (i >= size * 1)) {
                error_count++;
            }

            break;

        case 1:

            // First warp and half of second should be all zeros.
            if (hinfo[i] != (i >= size * 3 / 2)) {
                error_count++;
            }

            break;

        case 2:

            // First two warps should be all zeros.
            if (hinfo[i] != (i >= size * 2)) {
                error_count++;
            }

            break;
        }

        break;

      case 1:

        // First warp and half of second should be all zeros.
        if (hinfo[i] != (i >= size * 3 / 2)) {
          error_count++;
        }

        break;

      case 2:

        // First two warps should be all zeros.
        if (hinfo[i] != (i >= size * 2)) {
          error_count++;
        }

        break;
    }
  }

  printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
  return error_count;
    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

int main(int argc, char **argv) {
  unsigned int *h_input, *h_result;
  unsigned int *d_input, *d_result;
int main(int argc, char **argv)
{
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;

  bool *dinfo = NULL, *hinfo = NULL;
  int error_count[3] = {0, 0, 0};
    bool *dinfo = NULL, *hinfo = NULL;
    int   error_count[3] = {0, 0, 0};

  cudaDeviceProp deviceProp;
  int devID, warp_size = 32;
    cudaDeviceProp deviceProp;
    int            devID, warp_size = 32;

  printf("%s\n", sSDKsample);
    printf("%s\n", sSDKsample);

  // This will pick the best possible CUDA capable device
  devID = findCudaDevice(argc, (const char **)argv);
    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

  // Statistics about the GPU device
  printf(
      "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
      deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount,
           deviceProp.major,
           deviceProp.minor);

  h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size *
                                   sizeof(unsigned int));
  h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size *
                                    sizeof(unsigned int));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&d_input),
                 VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&d_result),
                 VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
  genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
  checkCudaErrors(cudaMemcpy(d_input, h_input,
                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
                             cudaMemcpyHostToDevice));
    h_input  = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
    checkCudaErrors(
        cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));

  // Start of Vote Any Test Kernel #1
  printf("[VOTE Kernel Test 1/3]\n");
  printf("\tRunning <<Vote.Any>> kernel1 ...\n");
  {
    checkCudaErrors(cudaDeviceSynchronize());
    dim3 gridBlock(1, 1);
    dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
    VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result,
                                               VOTE_DATA_GROUP * warp_size);
    getLastCudaError("VoteAnyKernel() execution failed\n");
    checkCudaErrors(cudaDeviceSynchronize());
  }
  checkCudaErrors(cudaMemcpy(h_result, d_result,
                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
                             cudaMemcpyDeviceToHost));
  error_count[0] += checkResultsVoteAnyKernel1(
      h_result, VOTE_DATA_GROUP * warp_size, warp_size);
    // Start of Vote Any Test Kernel #1
    printf("[VOTE Kernel Test 1/3]\n");
    printf("\tRunning <<Vote.Any>> kernel1 ...\n");
    {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAnyKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

  // Start of Vote All Test Kernel #2
  printf("\n[VOTE Kernel Test 2/3]\n");
  printf("\tRunning <<Vote.All>> kernel2 ...\n");
  {
    checkCudaErrors(cudaDeviceSynchronize());
    dim3 gridBlock(1, 1);
    dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
    VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result,
                                               VOTE_DATA_GROUP * warp_size);
    getLastCudaError("VoteAllKernel() execution failed\n");
    checkCudaErrors(cudaDeviceSynchronize());
  }
  checkCudaErrors(cudaMemcpy(h_result, d_result,
                             VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
                             cudaMemcpyDeviceToHost));
  error_count[1] += checkResultsVoteAllKernel2(
      h_result, VOTE_DATA_GROUP * warp_size, warp_size);
    // Start of Vote All Test Kernel #2
    printf("\n[VOTE Kernel Test 2/3]\n");
    printf("\tRunning <<Vote.All>> kernel2 ...\n");
    {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAllKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

  // Second Vote Kernel Test #3 (both Any/All)
  hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
  cudaMalloc(reinterpret_cast<void **>(&dinfo),
             warp_size * 3 * 3 * sizeof(bool));
  cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool),
             cudaMemcpyHostToDevice);
    // Second Vote Kernel Test #3 (both Any/All)
    hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
    cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
    cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);

  printf("\n[VOTE Kernel Test 3/3]\n");
  printf("\tRunning <<Vote.Any>> kernel3 ...\n");
  {
    checkCudaErrors(cudaDeviceSynchronize());
    VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
    checkCudaErrors(cudaDeviceSynchronize());
  }
    printf("\n[VOTE Kernel Test 3/3]\n");
    printf("\tRunning <<Vote.Any>> kernel3 ...\n");
    {
        checkCudaErrors(cudaDeviceSynchronize());
        VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
        checkCudaErrors(cudaDeviceSynchronize());
    }

  cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool),
             cudaMemcpyDeviceToHost);
    cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);

  error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
    error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);

  // Now free these resources for Test #1,2
  checkCudaErrors(cudaFree(d_input));
  checkCudaErrors(cudaFree(d_result));
  free(h_input);
  free(h_result);
    // Now free these resources for Test #1,2
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_result));
    free(h_input);
    free(h_result);

  // Free resources from Test #3
  free(hinfo);
  cudaFree(dinfo);
    // Free resources from Test #3
    free(hinfo);
    cudaFree(dinfo);

  printf("\tShutting down...\n");
    printf("\tShutting down...\n");

  return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0)
             ? EXIT_SUCCESS
             : EXIT_FAILURE;
    return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
@ -38,43 +38,44 @@
// If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a
// non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result,
                               int size) {
  int tx = threadIdx.x;
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

  int mask = 0xffffffff;
  result[tx] = __any_sync(mask, input[tx]);
    int mask   = 0xffffffff;
    result[tx] = __any_sync(mask, input[tx]);
}

// Kernel #2 tests the across-the-warp vote(all) intrinsic.
// If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero
// value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result,
                               int size) {
  int tx = threadIdx.x;
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

  int mask = 0xffffffff;
  result[tx] = __all_sync(mask, input[tx]);
    int mask   = 0xffffffff;
    result[tx] = __all_sync(mask, input[tx]);
}

// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size) {
  int tx = threadIdx.x;
  unsigned int mask = 0xffffffff;
  bool *offs = info + (tx * 3);
__global__ void VoteAnyKernel3(bool *info, int warp_size)
{
    int          tx   = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool        *offs = info + (tx * 3);

  // The following should hold true for the second and third warp
  *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2));
  // The following should hold true for the "upper half" of the second warp,
  // and all of the third warp
  *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false);
    // The following should hold true for the second and third warp
    *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2));
    // The following should hold true for the "upper half" of the second warp,
    // and all of the third warp
    *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false);

  // The following should hold true for the third warp only
  if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) {
    *(offs + 2) = true;
  }
    // The following should hold true for the third warp only
    if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) {
        *(offs + 2) = true;
    }
}

#endif
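Taken together, these kernels exercise the warp-vote rule: with a full mask, __any_sync is non-zero when at least one participating lane's predicate is non-zero, while __all_sync is non-zero only when every lane's is. A self-contained sketch (assuming a single 32-thread warp; not part of the sample):

    __global__ void voteDemo(int *out)
    {
        unsigned int mask = 0xffffffff;
        int anyOdd = __any_sync(mask, threadIdx.x & 1); // 1: the odd lanes vote yes
        int allOdd = __all_sync(mask, threadIdx.x & 1); // 0: the even lanes vote no
        if (threadIdx.x == 0) {
            out[0] = anyOdd;
            out[1] = allOdd;
        }
    }

Launched as voteDemo<<<1, 32>>>(d_out), this writes {1, 0}.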
@ -41,12 +41,13 @@
#endif

/* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < N) {
    c[idx] = a[idx] + b[idx];
  }
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}

// Allocate generic memory with malloc() and pin it later instead of using
@ -54,194 +55,196 @@ __global__ void vectorAddGPU(float *a, float *b, float *c, int N) {
bool bPinGenericMemory = false;

// Macro to align up to the memory size in question
#define MEMORY_ALIGNMENT 4096
#define MEMORY_ALIGNMENT  4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
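ALIGN_UP rounds a pointer up to the next multiple of size (a power of two), which is why the generic-memory path below over-allocates each buffer by MEMORY_ALIGNMENT bytes. A worked example of the arithmetic:

    // (0x1001 + 0xFFF) & ~0xFFF == 0x2000  -> unaligned pointers round up
    // (0x2000 + 0xFFF) & ~0xFFF == 0x2000  -> already-aligned pointers are unchanged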

int main(int argc, char **argv) {
  int n, nelem, deviceCount;
  int idev = 0;  // use default device 0
  char *device = NULL;
  unsigned int flags;
  size_t bytes;
  float *a, *b, *c;           // Pinned memory allocated on the CPU
  float *a_UA, *b_UA, *c_UA;  // Non-4K Aligned Pinned memory on the CPU
  float *d_a, *d_b, *d_c;     // Device pointers for mapped memory
  float errorNorm, refNorm, ref, diff;
  cudaDeviceProp deviceProp;
int main(int argc, char **argv)
{
    int            n, nelem, deviceCount;
    int            idev   = 0; // use default device 0
    char          *device = NULL;
    unsigned int   flags;
    size_t         bytes;
    float         *a, *b, *c;          // Pinned memory allocated on the CPU
    float         *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU
    float         *d_a, *d_b, *d_c;    // Device pointers for mapped memory
    float          errorNorm, refNorm, ref, diff;
    cudaDeviceProp deviceProp;

  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printf("Usage:  simpleZeroCopy [OPTION]\n\n");
    printf("Options:\n");
    printf("  --device=[device #]  Specify the device to be used\n");
    printf(
        "  --use_generic_memory (optional) use generic page-aligned for system "
        "memory\n");
    return EXIT_SUCCESS;
  }

  /* Get the device selected by the user or default to 0, and then set it. */
  if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
    cudaGetDeviceCount(&deviceCount);
    idev = atoi(device);

    if (idev >= deviceCount || idev < 0) {
      fprintf(stderr,
              "Device number %d is invalid, will use default CUDA device 0.\n",
              idev);
      idev = 0;
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printf("Usage:  simpleZeroCopy [OPTION]\n\n");
        printf("Options:\n");
        printf("  --device=[device #]  Specify the device to be used\n");
        printf("  --use_generic_memory (optional) use generic page-aligned for system "
               "memory\n");
        return EXIT_SUCCESS;
    }
  }

  // if GPU found supports SM 1.2, then continue, otherwise we exit
  if (!checkCudaCapabilities(1, 2)) {
    exit(EXIT_SUCCESS);
  }
    /* Get the device selected by the user or default to 0, and then set it. */
    if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
        cudaGetDeviceCount(&deviceCount);
        idev = atoi(device);

  if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
        if (idev >= deviceCount || idev < 0) {
            fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
            idev = 0;
        }
    }

    // if GPU found supports SM 1.2, then continue, otherwise we exit
    if (!checkCudaCapabilities(1, 2)) {
        exit(EXIT_SUCCESS);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
    bPinGenericMemory = false;  // Generic Pinning of System Paged memory is not
                                // currently supported on Mac OSX
        bPinGenericMemory = false; // Generic Pinning of System Paged memory is not
                                   // currently supported on Mac OSX
#else
    bPinGenericMemory = true;
        bPinGenericMemory = true;
#endif
  }
    }

  if (bPinGenericMemory) {
    printf("> Using Generic System Paged Memory (malloc)\n");
  } else {
    printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
  }
    if (bPinGenericMemory) {
        printf("> Using Generic System Paged Memory (malloc)\n");
    }
    else {
        printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
    }

  checkCudaErrors(cudaSetDevice(idev));
    checkCudaErrors(cudaSetDevice(idev));

  /* Verify the selected device supports mapped memory and set the device
     flags for mapping host memory. */
    /* Verify the selected device supports mapped memory and set the device
       flags for mapping host memory. */

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev));

#if CUDART_VERSION >= 2020

  if (!deviceProp.canMapHostMemory) {
    fprintf(stderr, "Device %d does not support mapping CPU host memory!\n",
            idev);
    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);

        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
    fprintf(stderr,
            "CUDART version %d.%d does not support "
            "<cudaDeviceProp.canMapHostMemory> field\n",
            CUDART_VERSION / 1000,
            (CUDART_VERSION % 100) / 10);

    exit(EXIT_SUCCESS);
  }

  checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
  fprintf(stderr,
          "CUDART version %d.%d does not support "
          "<cudaDeviceProp.canMapHostMemory> field\n",
          CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10);
  exit(EXIT_SUCCESS);
#endif

#if CUDART_VERSION < 4000

  if (bPinGenericMemory) {
    fprintf(
        stderr,
        "CUDART version %d.%d does not support <cudaHostRegister> function\n",
        CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10);
    if (bPinGenericMemory) {
        fprintf(stderr,
                "CUDART version %d.%d does not support <cudaHostRegister> function\n",
                CUDART_VERSION / 1000,
                (CUDART_VERSION % 100) / 10);

    exit(EXIT_SUCCESS);
  }
        exit(EXIT_SUCCESS);
    }

#endif

  /* Allocate mapped CPU memory. */
    /* Allocate mapped CPU memory. */

  nelem = 1048576;
  bytes = nelem * sizeof(float);
    nelem = 1048576;
    bytes = nelem * sizeof(float);

  if (bPinGenericMemory) {
    if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000
    a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
        a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
        b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
        c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    // We need to ensure memory is aligned to 4K (so we will need to pad memory
    // accordingly)
    a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
    b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT);
    c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT);
        // We need to ensure memory is aligned to 4K (so we will need to pad memory
        // accordingly)
        a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
        b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT);
        c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT);

    checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));
    checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
    checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif
  } else {
    }
    else {
#if CUDART_VERSION >= 2020
    flags = cudaHostAllocMapped;
    checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
    checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags));
    checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags));
        flags = cudaHostAllocMapped;
        checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
        checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags));
        checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags));
#endif
  }
    }

  /* Initialize the vectors. */
    /* Initialize the vectors. */

  for (n = 0; n < nelem; n++) {
    a[n] = rand() / (float)RAND_MAX;
    b[n] = rand() / (float)RAND_MAX;
  }
    for (n = 0; n < nelem; n++) {
        a[n] = rand() / (float)RAND_MAX;
        b[n] = rand() / (float)RAND_MAX;
    }

    /* Get the device pointers for the pinned CPU memory mapped into the GPU
       memory space. */

#if CUDART_VERSION >= 2020
  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
  checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
#endif

  /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
   */
  printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n");
  dim3 block(256);
  dim3 grid((unsigned int)ceil(nelem / (float)block.x));
  vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("vectorAddGPU() execution failed");
    /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
     */
    printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n");
    dim3 block(256);
    dim3 grid((unsigned int)ceil(nelem / (float)block.x));
    vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("vectorAddGPU() execution failed");

  /* Compare the results */
    /* Compare the results */

  printf("> Checking the results from vectorAddGPU() ...\n");
  errorNorm = 0.f;
  refNorm = 0.f;
    printf("> Checking the results from vectorAddGPU() ...\n");
    errorNorm = 0.f;
    refNorm   = 0.f;

  for (n = 0; n < nelem; n++) {
    ref = a[n] + b[n];
    diff = c[n] - ref;
    errorNorm += diff * diff;
    refNorm += ref * ref;
  }
    for (n = 0; n < nelem; n++) {
        ref  = a[n] + b[n];
        diff = c[n] - ref;
        errorNorm += diff * diff;
        refNorm += ref * ref;
    }

  errorNorm = (float)sqrt((double)errorNorm);
  refNorm = (float)sqrt((double)refNorm);
    errorNorm = (float)sqrt((double)errorNorm);
    refNorm   = (float)sqrt((double)refNorm);

  /* Memory clean up */
    /* Memory clean up */

  printf("> Releasing CPU memory...\n");
    printf("> Releasing CPU memory...\n");

  if (bPinGenericMemory) {
    if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000
    checkCudaErrors(cudaHostUnregister(a));
    checkCudaErrors(cudaHostUnregister(b));
    checkCudaErrors(cudaHostUnregister(c));
    free(a_UA);
    free(b_UA);
    free(c_UA);
        checkCudaErrors(cudaHostUnregister(a));
        checkCudaErrors(cudaHostUnregister(b));
        checkCudaErrors(cudaHostUnregister(c));
        free(a_UA);
        free(b_UA);
        free(c_UA);
#endif
  } else {
    }
    else {
#if CUDART_VERSION >= 2020
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFreeHost(b));
    checkCudaErrors(cudaFreeHost(c));
        checkCudaErrors(cudaFreeHost(a));
        checkCudaErrors(cudaFreeHost(b));
        checkCudaErrors(cudaFreeHost(c));
#endif
  }
    }

  exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
    exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
}
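Stripped of the version guards, the mapped (zero-copy) flow this sample follows reduces to a few calls; a minimal sketch of the cudaHostAlloc branch (buffer names illustrative):

    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost)); // before the context is created
    float *h_buf, *d_buf;
    checkCudaErrors(cudaHostAlloc((void **)&h_buf, bytes, cudaHostAllocMapped));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_buf, (void *)h_buf, 0));
    // Kernels may now read/write d_buf; the data lives in h_buf, with no explicit cudaMemcpy.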
@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

@ -29,113 +29,111 @@
 * memory.
 */

#include <cstdio>
#include <ctime>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>
#include <cstdio>
#include <ctime>

#define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b)

#define LOOP_NUM 50
__global__ void atomicKernel(int *atom_arr) {
  unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  for (int i = 0; i < LOOP_NUM; i++) {
    // Atomic addition
    atomicAdd_system(&atom_arr[0], 10);
    for (int i = 0; i < LOOP_NUM; i++) {
        // Atomic addition
        atomicAdd_system(&atom_arr[0], 10);

    // Atomic exchange
    atomicExch_system(&atom_arr[1], tid);
        // Atomic exchange
        atomicExch_system(&atom_arr[1], tid);

    // Atomic maximum
    atomicMax_system(&atom_arr[2], tid);
        // Atomic maximum
        atomicMax_system(&atom_arr[2], tid);

    // Atomic minimum
    atomicMin_system(&atom_arr[3], tid);
        // Atomic minimum
        atomicMin_system(&atom_arr[3], tid);

    // Atomic increment (modulo 17+1)
    atomicInc_system((unsigned int *)&atom_arr[4], 17);
        // Atomic increment (modulo 17+1)
        atomicInc_system((unsigned int *)&atom_arr[4], 17);

    // Atomic decrement
    atomicDec_system((unsigned int *)&atom_arr[5], 137);
        // Atomic decrement
        atomicDec_system((unsigned int *)&atom_arr[5], 137);

    // Atomic compare-and-swap
    atomicCAS_system(&atom_arr[6], tid - 1, tid);
        // Atomic compare-and-swap
        atomicCAS_system(&atom_arr[6], tid - 1, tid);

    // Bitwise atomic instructions
        // Bitwise atomic instructions

    // Atomic AND
    atomicAnd_system(&atom_arr[7], 2 * tid + 7);
        // Atomic AND
        atomicAnd_system(&atom_arr[7], 2 * tid + 7);

    // Atomic OR
    atomicOr_system(&atom_arr[8], 1 << tid);
        // Atomic OR
        atomicOr_system(&atom_arr[8], 1 << tid);

    // Atomic XOR
    atomicXor_system(&atom_arr[9], tid);
  }
        // Atomic XOR
        atomicXor_system(&atom_arr[9], tid);
    }
}
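The _system suffix widens each atomic's scope from the device to the whole system, so the CPU threads in atomicKernel_CPU below contend on the same array. The wrap-around semantics of the increment/decrement pair, which that CPU code reproduces with compare-and-swap loops, can be written out as (per the CUDA documentation of atomicInc/atomicDec):

    // atomicInc(p, limit): old = *p; *p = (old >= limit) ? 0 : old + 1; return old;
    // atomicDec(p, limit): old = *p; *p = (old == 0 || old > limit) ? limit : old - 1; return old;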
void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
  for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
    for (int j = 0; j < LOOP_NUM; j++) {
      // Atomic addition
      __sync_fetch_and_add(&atom_arr[0], 10);
void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
        for (int j = 0; j < LOOP_NUM; j++) {
            // Atomic addition
            __sync_fetch_and_add(&atom_arr[0], 10);

      // Atomic exchange
      __sync_lock_test_and_set(&atom_arr[1], i);
            // Atomic exchange
            __sync_lock_test_and_set(&atom_arr[1], i);

      // Atomic maximum
      int old, expected;
      do {
        expected = atom_arr[2];
        old = __sync_val_compare_and_swap(&atom_arr[2], expected,
                                          max(expected, i));
      } while (old != expected);
            // Atomic maximum
            int old, expected;
            do {
                expected = atom_arr[2];
                old      = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
            } while (old != expected);

      // Atomic minimum
      do {
        expected = atom_arr[3];
        old = __sync_val_compare_and_swap(&atom_arr[3], expected,
                                          min(expected, i));
      } while (old != expected);
            // Atomic minimum
            do {
                expected = atom_arr[3];
                old      = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
            } while (old != expected);

      // Atomic increment (modulo 17+1)
      int limit = 17;
      do {
        expected = atom_arr[4];
        old = __sync_val_compare_and_swap(
            &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
      } while (old != expected);
            // Atomic increment (modulo 17+1)
            int limit = 17;
            do {
                expected = atom_arr[4];
                old      = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
            } while (old != expected);

      // Atomic decrement
      limit = 137;
      do {
        expected = atom_arr[5];
        old = __sync_val_compare_and_swap(
            &atom_arr[5], expected,
            ((expected == 0) || (expected > limit)) ? limit : expected - 1);
      } while (old != expected);
            // Atomic decrement
            limit = 137;
            do {
                expected = atom_arr[5];
                old      = __sync_val_compare_and_swap(
                    &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
            } while (old != expected);

      // Atomic compare-and-swap
      __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);
            // Atomic compare-and-swap
            __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);

      // Bitwise atomic instructions
            // Bitwise atomic instructions

      // Atomic AND
      __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);
            // Atomic AND
            __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);

      // Atomic OR
      __sync_fetch_and_or(&atom_arr[8], 1 << i);
            // Atomic OR
            __sync_fetch_and_or(&atom_arr[8], 1 << i);
      // Atomic XOR
 | 
			
		||||
      // 11th element should be 0xff
 | 
			
		||||
      __sync_fetch_and_xor(&atom_arr[9], i);
 | 
			
		||||
            // Atomic XOR
 | 
			
		||||
            // 11th element should be 0xff
 | 
			
		||||
            __sync_fetch_and_xor(&atom_arr[9], i);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
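// The CAS retry loops above emulate fetch-and-max/min on top of
// __sync_val_compare_and_swap. A minimal C++11 sketch of the same idiom with
// std::atomic (hypothetical helper, not part of this sample):
#include <algorithm>
#include <atomic>

static void atomicFetchMax(std::atomic<int> &target, int value)
{
    int expected = target.load();
    // compare_exchange_weak reloads `expected` on failure, so the loop retries
    // until no other thread updated `target` in between.
    while (!target.compare_exchange_weak(expected, std::max(expected, value))) {
    }
}
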
////////////////////////////////////////////////////////////////////////////////
@@ -145,198 +143,201 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len)
{
    int val = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {
        val += 10;
    }

    if (val != testData[0]) {
        printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
        return false;
    }

    val = 0;

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // second element should be a member of [0, len)
        if (i == testData[1]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // third element should be len-1
        val = max(val, i);
    }

    if (val != testData[2]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != testData[3]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != testData[4]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != testData[5]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // seventh element should be a member of [0, len)
        if (i == testData[6]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 8th element should be 1
        val &= (2 * i + 7);
    }

    if (val != testData[7]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 0xff
        val |= (1 << i);
    }

    if (val != testData[8]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != testData[9]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

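// The GPU half of this test (in an earlier hunk of this file) performs the
// same updates with the _system-scoped atomics available from compute
// capability 6.0, which is why main() below waives devices older than that. A
// minimal sketch of the idea (illustrative kernel name and body, not the
// sample's exact code):
__global__ void atomicAddKernel_system(int *atom_arr)
{
    for (int j = 0; j < LOOP_NUM; j++) {
        // atomicAdd_system makes the update coherent with concurrent CPU
        // threads, not just with other threads on the same GPU.
        atomicAdd_system(&atom_arr[0], 10);
    }
}
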
int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int            dev_id = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if (!device_prop.managedMemory) {
        // This sample requires being run on a device that supports Unified Memory
        fprintf(stderr, "Unified Memory not supported on this device\n");
        exit(EXIT_WAIVED);
    }

    if (device_prop.computeMode == cudaComputeModeProhibited) {
        // This sample requires being run with a default or process exclusive mode
        fprintf(stderr,
                "This sample requires a device in either default or process "
                "exclusive mode\n");
        exit(EXIT_WAIVED);
    }

    if (device_prop.major < 6) {
        printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
               "testing.\n",
               argv[0]);
        exit(EXIT_WAIVED);
    }

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 10;

    int *atom_arr;

    if (device_prop.pageableMemoryAccess) {
        printf("CAN access pageable memory\n");
        atom_arr = (int *)malloc(sizeof(int) * numData);
    }
    else {
        printf("CANNOT access pageable memory\n");
        checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
    }

    for (unsigned int i = 0; i < numData; i++)
        atom_arr[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    atom_arr[7] = atom_arr[9] = 0xff;

    atomicKernel<<<numBlocks, numThreads>>>(atom_arr);
    atomicKernel_CPU(atom_arr, numBlocks * numThreads);

    checkCudaErrors(cudaDeviceSynchronize());

    // Compute & verify reference solution
    int testResult = verify(atom_arr, 2 * numThreads * numBlocks);

    if (device_prop.pageableMemoryAccess) {
        free(atom_arr);
    }
    else {
        cudaFree(atom_arr);
    }

    printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
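// The pageableMemoryAccess branch in main() above decides between plain malloc
// and cudaMallocManaged. The same capability can also be queried without
// filling a whole cudaDeviceProp; a small sketch (hypothetical helper):
static bool canAccessPageableMemory(int dev_id)
{
    int pageableAccess = 0;
    // Nonzero means the GPU can coherently access ordinary malloc'd host
    // memory, so no managed allocation is required.
    checkCudaErrors(cudaDeviceGetAttribute(&pageableAccess, cudaDevAttrPageableMemoryAccess, dev_id));
    return pageableAccess != 0;
}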

@@ -31,10 +31,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>
@@ -47,34 +47,34 @@
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata)
{
    // shared memory
    // the size is determined by the host application
    extern __shared__ float sdata[];

    // access thread id
    const unsigned int tid = threadIdx.x;
    // access number of threads in this block
    const unsigned int num_threads = blockDim.x;

    // read in input data from global memory
    sdata[tid] = g_idata[tid];
    __syncthreads();

    // perform some computations
    sdata[tid] = (float)num_threads * sdata[tid];
    __syncthreads();

    // write data to global memory
    g_odata[tid] = sdata[tid];
}

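// testKernel declares sdata with `extern __shared__`, so its size is fixed at
// launch time by the third launch-configuration argument rather than at
// compile time. The matching launch appears in runTest() below:
//
//     testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
//
// where mem_size = sizeof(float) * num_threads reserves one float per thread.
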
////////////////////////////////////////////////////////////////////////////////
@@ -85,81 +85,81 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResult = true;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int num_threads = 32;
    unsigned int mem_size    = sizeof(float) * num_threads;

    // allocate host memory
    float *h_idata = (float *)malloc(mem_size);

    // initialize the memory
    for (unsigned int i = 0; i < num_threads; ++i) {
        h_idata[i] = (float)i;
    }

    // allocate device memory
    float *d_idata;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // allocate device memory for result
    float *d_odata;
    checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));

    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

    // execute the kernel
    testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(mem_size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // compute reference solution
    float *reference = (float *)malloc(mem_size);
    computeGold(reference, h_idata, num_threads);

    // check result
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
    }
    else {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
    }

    // cleanup memory
    free(h_idata);
    free(h_odata);
    free(reference);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

@@ -26,8 +26,7 @@
 */

// export C interface
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
@@ -36,10 +35,11 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len)
{
    const float f_len = static_cast<float>(len);

    for (unsigned int i = 0; i < len; ++i) {
        reference[i] = idata[i] * f_len;
    }
}

@@ -37,7 +37,6 @@

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>
/**
 * CUDA Kernel Device code
@@ -45,166 +44,153 @@
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {
        C[i] = A[i] + B[i] + 0.0f;
    }
}

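// vectorAdd assumes the grid covers every element, with the `i < numElements`
// guard absorbing the final partial block. A common, equivalent alternative is
// a grid-stride loop, which stays correct for any grid size (a sketch, not the
// sample's kernel):
__global__ void vectorAddStride(const float *A, const float *B, float *C, int numElements)
{
    // Each thread handles elements i, i + gridSize, i + 2 * gridSize, ...
    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < numElements; i += blockDim.x * gridDim.x) {
        C[i] = A[i] + B[i];
    }
}
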
/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int    numElements = 50000;
    size_t size        = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err        = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err        = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err        = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid   = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}

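// The error handling above repeats the same four-line pattern after every CUDA
// call. A small macro in the same spirit (a sketch; this sample deliberately
// spells the checks out, and helper_cuda.h's checkCudaErrors offers the same
// service):
#define CHECK_CUDA(call)                                                                                               \
    do {                                                                                                               \
        cudaError_t e = (call);                                                                                        \
        if (e != cudaSuccess) {                                                                                        \
            fprintf(stderr, "%s failed (error code %s)!\n", #call, cudaGetErrorString(e));                             \
            exit(EXIT_FAILURE);                                                                                        \
        }                                                                                                              \
    } while (0)

// Usage: CHECK_CUDA(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
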
@@ -34,11 +34,11 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda_drvapi.h>
@@ -50,19 +50,19 @@
using namespace std;

// Variables
CUdevice    cuDevice;
CUcontext   cuContext;
CUmodule    cuModule;
CUfunction  vecAdd_kernel;
float      *h_A;
float      *h_B;
float      *h_C;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;

// Functions
int  CleanupNoFailure();
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, string &);

@@ -72,150 +72,152 @@ bool findModulePath(const char *, string &, char **, string &);
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int    N = 50000, devID = 0;
    size_t size = N * sizeof(float);

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    checkCudaErrors(cuMemAlloc(&d_A, size));

    checkCudaErrors(cuMemAlloc(&d_B, size));

    checkCudaErrors(cuMemAlloc(&d_C, size));

    // Copy vectors from host memory to device memory
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));

    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
        // Launch (simpler method)

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        void *args[] = {&d_A, &d_B, &d_C, &N};

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
        // Launch (advanced method)
        int   offset = 0;
        void *argBuffer[16];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = N;
        offset += sizeof(N);

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        checkCudaErrors(
            cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
    }

#ifdef _DEBUG
    checkCudaErrors(cuCtxSynchronize());
#endif

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));

    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    CleanupNoFailure();
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}
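// The dead `else` branch above packs arguments into a byte buffer and hands it
// to cuLaunchKernel through the trailing `extra` parameter. The form documented
// for the driver API wraps that buffer in a CU_LAUNCH_PARAM config array; a
// sketch of that packaging (assuming the same argBuffer and offset as above):
//
//     size_t argBufferSize = offset;
//     void  *extra[]       = {CU_LAUNCH_PARAM_BUFFER_POINTER,
//                             argBuffer,
//                             CU_LAUNCH_PARAM_BUFFER_SIZE,
//                             &argBufferSize,
//                             CU_LAUNCH_PARAM_END};
//     checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, extra));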

int CleanupNoFailure()
{
    // Free device memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));

    // Free host memory
    if (h_A) {
        free(h_A);
    }

    if (h_B) {
        free(h_B);
    }

    if (h_C) {
        free(h_C);
    }

    checkCudaErrors(cuCtxDestroy(cuContext));

    return EXIT_SUCCESS;
}
// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

@@ -33,9 +33,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}

@@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@@ -29,172 +29,172 @@

static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

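// round_up pads x to the next multiple of y, e.g. round_up(5, 4) == 8 and
// round_up(8, 4) == 8. Below it serves to make allocation sizes respect the
// granularity reported by cuMemGetAllocationGranularity; with a typical 2 MiB
// minimum granularity, a 3 MiB request would be padded to 4 MiB.
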
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr                 *dptr,
                                     size_t                      *allocationSize,
                                     size_t                       size,
                                     const std::vector<CUdevice> &residentDevices,
                                     const std::vector<CUdevice> &mappingDevices,
                                     size_t                       align)
{
    CUresult status          = CUDA_SUCCESS;
    size_t   min_granularity = 0;
    size_t   stripeSize;

    // Setup the properties common for all the chunks
    // The allocations will be device pinned memory.
    // This property structure describes the physical location where the memory
    // will be allocated via cuMemCreate along with additional properties. In this
    // case, the allocation will be pinned device memory local to a given device.
    CUmemAllocationProp prop = {};
    prop.type                = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type       = CU_MEM_LOCATION_TYPE_DEVICE;

    // Get the minimum granularity needed for the resident devices
    // (the max of the minimum granularity of each participating device)
    for (int idx = 0; idx < residentDevices.size(); idx++) {
        size_t granularity = 0;

        // get the minimum granularity for residentDevices[idx]
        prop.location.id = residentDevices[idx];
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        if (status != CUDA_SUCCESS) {
            goto done;
        }
        if (min_granularity < granularity) {
            min_granularity = granularity;
        }
    }

    // Get the minimum granularity needed for the accessing devices
    // (the max of the minimum granularity of each participating device)
    for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
        size_t granularity = 0;

        // get the minimum granularity for mappingDevices[idx]
        prop.location.id = mappingDevices[idx];
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        if (status != CUDA_SUCCESS) {
            goto done;
        }
        if (min_granularity < granularity) {
            min_granularity = granularity;
        }
    }

    // Round up the size such that we can evenly split it into a stripe size that
    // meets the granularity requirements Essentially size = N *
			
		||||
  // residentDevices.size() * min_granularity is the requirement, since each
 | 
			
		||||
  // piece of the allocation will be stripeSize = N * min_granularity and the
 | 
			
		||||
  // min_granularity requirement applies to each stripeSize piece of the
 | 
			
		||||
  // allocation.
 | 
			
		||||
  size = round_up(size, residentDevices.size() * min_granularity);
 | 
			
		||||
  stripeSize = size / residentDevices.size();
 | 
			
		||||
 | 
			
		||||
  // Return the rounded up size to the caller for use in the free
 | 
			
		||||
  if (allocationSize) {
 | 
			
		||||
    *allocationSize = size;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Reserve the required contiguous VA space for the allocations
 | 
			
		||||
  status = cuMemAddressReserve(dptr, size, align, 0, 0);
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    goto done;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Create and map the backings on each gpu
 | 
			
		||||
  // note: reusing CUmemAllocationProp prop from earlier with prop.type &
 | 
			
		||||
  // prop.location.type already specified.
 | 
			
		||||
  for (size_t idx = 0; idx < residentDevices.size(); idx++) {
 | 
			
		||||
    CUresult status2 = CUDA_SUCCESS;
 | 
			
		||||
 | 
			
		||||
    // Set the location for this chunk to this device
 | 
			
		||||
    prop.location.id = residentDevices[idx];
 | 
			
		||||
 | 
			
		||||
    // Create the allocation as a pinned allocation on this device
 | 
			
		||||
    CUmemGenericAllocationHandle allocationHandle;
 | 
			
		||||
    status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
      goto done;
 | 
			
		||||
        // get the minnimum granularity for residentDevices[idx]
 | 
			
		||||
        prop.location.id = residentDevices[idx];
 | 
			
		||||
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
        if (min_granularity < granularity) {
 | 
			
		||||
            min_granularity = granularity;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Assign the chunk to the appropriate VA range and release the handle.
 | 
			
		||||
    // After mapping the memory, it can be referenced by virtual address.
 | 
			
		||||
    // Since we do not need to make any other mappings of this memory or export
 | 
			
		||||
    // it, we no longer need and can release the allocationHandle. The
 | 
			
		||||
    // allocation will be kept live until it is unmapped.
 | 
			
		||||
    status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
 | 
			
		||||
                      allocationHandle, 0);
 | 
			
		||||
 | 
			
		||||
    // the handle needs to be released even if the mapping failed.
 | 
			
		||||
    status2 = cuMemRelease(allocationHandle);
 | 
			
		||||
    if (status == CUDA_SUCCESS) {
 | 
			
		||||
      // cuMemRelease should not have failed here
 | 
			
		||||
      // as the handle was just allocated successfully
 | 
			
		||||
      // however return an error if it does.
 | 
			
		||||
      status = status2;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Cleanup in case of any mapping failures.
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
      goto done;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  {
 | 
			
		||||
    // Each accessDescriptor will describe the mapping requirement for a single
 | 
			
		||||
    // device
 | 
			
		||||
    std::vector<CUmemAccessDesc> accessDescriptors;
 | 
			
		||||
    accessDescriptors.resize(mappingDevices.size());
 | 
			
		||||
 | 
			
		||||
    // Prepare the access descriptor array indicating where and how the backings
 | 
			
		||||
    // should be visible.
 | 
			
		||||
    // Get the minimum granularity needed for the accessing devices
 | 
			
		||||
    // (the max of the minimum granularity of each participating device)
 | 
			
		||||
    for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
 | 
			
		||||
      // Specify which device we are adding mappings for.
 | 
			
		||||
      accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 | 
			
		||||
      accessDescriptors[idx].location.id = mappingDevices[idx];
 | 
			
		||||
        size_t granularity = 0;
 | 
			
		||||
 | 
			
		||||
      // Specify both read and write access.
 | 
			
		||||
      accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 | 
			
		||||
        // get the minnimum granularity for mappingDevices[idx]
 | 
			
		||||
        prop.location.id = mappingDevices[idx];
 | 
			
		||||
        status           = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
        if (min_granularity < granularity) {
 | 
			
		||||
            min_granularity = granularity;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Apply the access descriptors to the whole VA range.
 | 
			
		||||
    status = cuMemSetAccess(*dptr, size, &accessDescriptors[0],
 | 
			
		||||
                            accessDescriptors.size());
 | 
			
		||||
    // Round up the size such that we can evenly split it into a stripe size tha
 | 
			
		||||
    // meets the granularity requirements Essentially size = N *
 | 
			
		||||
    // residentDevices.size() * min_granularity is the requirement, since each
 | 
			
		||||
    // piece of the allocation will be stripeSize = N * min_granularity and the
 | 
			
		||||
    // min_granularity requirement applies to each stripeSize piece of the
 | 
			
		||||
    // allocation.
 | 
			
		||||
    size       = round_up(size, residentDevices.size() * min_granularity);
 | 
			
		||||
    stripeSize = size / residentDevices.size();
 | 
			
		||||
 | 
			
		||||
    // Return the rounded up size to the caller for use in the free
 | 
			
		||||
    if (allocationSize) {
 | 
			
		||||
        *allocationSize = size;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Reserve the required contiguous VA space for the allocations
 | 
			
		||||
    status = cuMemAddressReserve(dptr, size, align, 0, 0);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
      goto done;
 | 
			
		||||
        goto done;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Create and map the backings on each gpu
 | 
			
		||||
    // note: reusing CUmemAllocationProp prop from earlier with prop.type &
 | 
			
		||||
    // prop.location.type already specified.
 | 
			
		||||
    for (size_t idx = 0; idx < residentDevices.size(); idx++) {
 | 
			
		||||
        CUresult status2 = CUDA_SUCCESS;
 | 
			
		||||
 | 
			
		||||
        // Set the location for this chunk to this device
 | 
			
		||||
        prop.location.id = residentDevices[idx];
 | 
			
		||||
 | 
			
		||||
        // Create the allocation as a pinned allocation on this device
 | 
			
		||||
        CUmemGenericAllocationHandle allocationHandle;
 | 
			
		||||
        status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Assign the chunk to the appropriate VA range and release the handle.
 | 
			
		||||
        // After mapping the memory, it can be referenced by virtual address.
 | 
			
		||||
        // Since we do not need to make any other mappings of this memory or export
 | 
			
		||||
        // it, we no longer need and can release the allocationHandle. The
 | 
			
		||||
        // allocation will be kept live until it is unmapped.
 | 
			
		||||
        status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
 | 
			
		||||
 | 
			
		||||
        // the handle needs to be released even if the mapping failed.
 | 
			
		||||
        status2 = cuMemRelease(allocationHandle);
 | 
			
		||||
        if (status == CUDA_SUCCESS) {
 | 
			
		||||
            // cuMemRelease should not have failed here
 | 
			
		||||
            // as the handle was just allocated successfully
 | 
			
		||||
            // however return an error if it does.
 | 
			
		||||
            status = status2;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Cleanup in case of any mapping failures.
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    {
 | 
			
		||||
        // Each accessDescriptor will describe the mapping requirement for a single
 | 
			
		||||
        // device
 | 
			
		||||
        std::vector<CUmemAccessDesc> accessDescriptors;
 | 
			
		||||
        accessDescriptors.resize(mappingDevices.size());
 | 
			
		||||
 | 
			
		||||
        // Prepare the access descriptor array indicating where and how the backings
 | 
			
		||||
        // should be visible.
 | 
			
		||||
        for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
 | 
			
		||||
            // Specify which device we are adding mappings for.
 | 
			
		||||
            accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 | 
			
		||||
            accessDescriptors[idx].location.id   = mappingDevices[idx];
 | 
			
		||||
 | 
			
		||||
            // Specify both read and write access.
 | 
			
		||||
            accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Apply the access descriptors to the whole VA range.
 | 
			
		||||
        status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
 | 
			
		||||
        if (status != CUDA_SUCCESS) {
 | 
			
		||||
            goto done;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
done:
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    if (*dptr) {
 | 
			
		||||
      simpleFreeMultiDeviceMmap(*dptr, size);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
        if (*dptr) {
 | 
			
		||||
            simpleFreeMultiDeviceMmap(*dptr, size);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return status;
 | 
			
		||||
    return status;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) {
 | 
			
		||||
  CUresult status = CUDA_SUCCESS;
 | 
			
		||||
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
 | 
			
		||||
{
 | 
			
		||||
    CUresult status = CUDA_SUCCESS;
 | 
			
		||||
 | 
			
		||||
  // Unmap the mapped virtual memory region
 | 
			
		||||
  // Since the handles to the mapped backing stores have already been released
 | 
			
		||||
  // by cuMemRelease, and these are the only/last mappings referencing them,
 | 
			
		||||
  // The backing stores will be freed.
 | 
			
		||||
  // Since the memory has been unmapped after this call, accessing the specified
 | 
			
		||||
  // va range will result in a fault (unitll it is remapped).
 | 
			
		||||
  status = cuMemUnmap(dptr, size);
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    return status;
 | 
			
		||||
  }
 | 
			
		||||
  // Free the virtual address region.  This allows the virtual address region
 | 
			
		||||
  // to be reused by future cuMemAddressReserve calls.  This also allows the
 | 
			
		||||
  // virtual address region to be used by other allocation made through
 | 
			
		||||
  // opperating system calls like malloc & mmap.
 | 
			
		||||
  status = cuMemAddressFree(dptr, size);
 | 
			
		||||
  if (status != CUDA_SUCCESS) {
 | 
			
		||||
    return status;
 | 
			
		||||
  }
 | 
			
		||||
    // Unmap the mapped virtual memory region
 | 
			
		||||
    // Since the handles to the mapped backing stores have already been released
 | 
			
		||||
    // by cuMemRelease, and these are the only/last mappings referencing them,
 | 
			
		||||
    // The backing stores will be freed.
 | 
			
		||||
    // Since the memory has been unmapped after this call, accessing the specified
 | 
			
		||||
    // va range will result in a fault (unitll it is remapped).
 | 
			
		||||
    status = cuMemUnmap(dptr, size);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
        return status;
 | 
			
		||||
    }
 | 
			
		||||
    // Free the virtual address region.  This allows the virtual address region
 | 
			
		||||
    // to be reused by future cuMemAddressReserve calls.  This also allows the
 | 
			
		||||
    // virtual address region to be used by other allocation made through
 | 
			
		||||
    // opperating system calls like malloc & mmap.
 | 
			
		||||
    status = cuMemAddressFree(dptr, size);
 | 
			
		||||
    if (status != CUDA_SUCCESS) {
 | 
			
		||||
        return status;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  return status;
 | 
			
		||||
    return status;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
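For orientation, here is a minimal usage sketch of the two helpers above, not part of the change itself. It assumes cuInit and an active context have already been set up; the single-device vector and the 1 MiB request are illustrative values only, and error handling is trimmed down to one check.

    // Sketch: allocate via the VMM helpers, use the pointer, then free with the
    // rounded-up size the allocator reported back.
    std::vector<CUdevice> devices = {0}; // resident and mapping devices (illustrative)
    CUdeviceptr           ptr        = 0;
    size_t                actualSize = 0;

    // Request 1 MiB. With, say, a 2 MiB minimum granularity and one resident
    // device, round_up() bumps the request to 2 MiB and actualSize reports it.
    if (simpleMallocMultiDeviceMmap(&ptr, &actualSize, 1 << 20, devices, devices, 0) == CUDA_SUCCESS) {
        // ... use ptr like any device pointer (cuMemcpyHtoD, kernel argument, ...) ...
        simpleFreeMultiDeviceMmap(ptr, actualSize); // must pass the rounded-up size back
    }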
@@ -63,10 +63,12 @@
//! handle
//!   is not needed after its mappings are set up.
////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr                 *dptr,
                                     size_t                      *allocationSize,
                                     size_t                       size,
                                     const std::vector<CUdevice> &residentDevices,
                                     const std::vector<CUdevice> &mappingDevices,
                                     size_t                       align = 0);

////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap
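A note on the align parameter defaulting to 0: in the implementation it is forwarded straight to cuMemAddressReserve, where an alignment of 0 asks the driver to choose a suitable default for the reserved VA range. A caller that needs a specific placement could pass an explicit power of two instead; reusing the hypothetical names from the sketch earlier:

    // Hypothetical: reserve the VA range with 4 MiB alignment instead of the default.
    simpleMallocMultiDeviceMmap(&ptr, &actualSize, 1 << 20, devices, devices, 4 << 20);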
@@ -36,11 +36,11 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda_drvapi.h>
@@ -54,115 +54,111 @@
using namespace std;

// Variables
CUdevice    cuDevice;
CUcontext   cuContext;
CUmodule    cuModule;
CUfunction  vecAdd_kernel;
float      *h_A;
float      *h_B;
float      *h_C;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;
size_t      allocationSize = 0;

// Functions
int  CleanupNoFailure();
void RandomInit(float *, int);

// define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// collect all of the devices whose memory can be mapped from cuDevice.
vector<CUdevice> getBackingDevices(CUdevice cuDevice)
{
    int num_devices;

    checkCudaErrors(cuDeviceGetCount(&num_devices));

    vector<CUdevice> backingDevices;
    backingDevices.push_back(cuDevice);
    for (int dev = 0; dev < num_devices; dev++) {
        int capable      = 0;
        int attributeVal = 0;

        // The mapping device is already in the backingDevices vector
        if (dev == cuDevice) {
            continue;
        }

        // Only peer capable devices can map each other's memory
        checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
        if (!capable) {
            continue;
        }

        // The device needs to support virtual address management for the required
        // APIs to work
        checkCudaErrors(
            cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
        if (attributeVal == 0) {
            continue;
        }

        backingDevices.push_back(dev);
    }
    return backingDevices;
}
// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int    N            = 50000;
    size_t size         = N * sizeof(float);
    int    attributeVal = 0;

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // Check that the selected device supports virtual address management
    checkCudaErrors(
        cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
    printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
    if (attributeVal == 0) {
        printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
        exit(EXIT_WAIVED);
    }

    // The vector addition happens on cuDevice, so the allocations need to be
    // mapped there.
    vector<CUdevice> mappingDevices;
    mappingDevices.push_back(cuDevice);

    // Collect devices accessible by the mapping device (cuDevice) into the
    // backingDevices vector.
    vector<CUdevice> backingDevices = getBackingDevices(cuDevice);

    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }
@@ -204,13 +200,10 @@ int main(int argc, char **argv)
    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = {&d_A, &d_B, &d_C, &N};

    // Launch the CUDA kernel
    checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
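A quick worked example of the launch math above, using the sample's own values:

    // blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock  (ceiling of N/256)
    //   N = 50000, threadsPerBlock = 256
    //   blocksPerGrid = (50000 + 255) / 256 = 196
    //   196 * 256 = 50176 threads launched; the final 176 fail the kernel's
    //   i < N guard and simply return, so the overshoot is harmless.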
@@ -219,20 +212,18 @@ int main(int argc, char **argv)
    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    CleanupNoFailure();
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

int CleanupNoFailure()
@@ -243,18 +234,15 @@ int CleanupNoFailure()
    checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));

    // Free host memory
    if (h_A) {
        free(h_A);
    }

    if (h_B) {
        free(h_B);
    }

    if (h_C) {
        free(h_C);
    }
@@ -265,8 +253,7 @@ int CleanupNoFailure()
// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}
@@ -34,9 +34,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}
@@ -33,8 +33,8 @@
 * of the programming guide with some additions like error checking.
 */

#include <cmath>
#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
@@ -42,112 +42,116 @@

// helper functions and utilities to work with CUDA
#include <helper_functions.h>

#include <nvrtc_helper.h>

/**
 * Host main routine
 */
int main(int argc, char **argv)
{
    char  *cubin, *kernel_file;
    size_t cubinSize;
    kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
    CUmodule module = loadCUBIN(cubin, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));

    // Print the vector length to be used, and compute its size
    int    numElements = 50000;
    size_t size        = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = reinterpret_cast<float *>(malloc(size));

    // Allocate the host input vector B
    float *h_B = reinterpret_cast<float *>(malloc(size));

    // Allocate the host output vector C
    float *h_C = reinterpret_cast<float *>(malloc(size));

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / static_cast<float>(RAND_MAX);
        h_B[i] = rand() / static_cast<float>(RAND_MAX);
    }

    // Allocate the device input vector A
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, size));

    // Allocate the device input vector B
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, size));

    // Allocate the device output vector C
    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, size));

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid   = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize(blocksPerGrid, 1, 1);

    void *arr[] = {reinterpret_cast<void *>(&d_A),
                   reinterpret_cast<void *>(&d_B),
                   reinterpret_cast<void *>(&d_C),
                   reinterpret_cast<void *>(&numElements)};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));
    checkCudaErrors(cuCtxSynchronize());

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");

    return 0;
}
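One detail of the launch above that is easy to miss: cuLaunchKernel receives kernel parameters as an array of pointers to each argument's storage, not the argument values themselves, which is why arr collects addresses. A condensed, hypothetical sketch of the same call shape, assuming kernel_addr takes the same four parameters and the device buffers are already allocated:

    // Each entry points at an argument; the driver reads the right number of
    // bytes per parameter based on the kernel's signature.
    CUdeviceptr a, b, c; // assumed already allocated with cuMemAlloc
    int         n = 50000;
    void       *params[] = {&a, &b, &c, &n};
    // 196 blocks of 256 threads, no shared memory, default stream, no extras.
    checkCudaErrors(cuLaunchKernel(kernel_addr, 196, 1, 1, 256, 1, 1, 0, NULL, params, NULL));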
@@ -32,11 +32,11 @@
 * number of elements numElements.
 */

extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

(File diff suppressed because it is too large)

@@ -32,12 +32,11 @@
#include <cuda_runtime.h>
#include <helper_cuda.h>

#include <iostream>
#include <memory>
#include <string>

int   *pArgc = NULL;
char **pArgv = NULL;

#if CUDART_VERSION < 5000
@@ -46,19 +45,16 @@ char **pArgv = NULL;
#include <cuda.h>

// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error) {
        fprintf(
            stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);

        exit(EXIT_FAILURE);
    }
}

#endif /* CUDART_VERSION < 5000 */
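For reference, the template above deduces T from the output pointer; a minimal call sketch (device 0 and the memory-clock attribute are illustrative, mirroring how the fallback path below uses it):

    int memoryClockKHz = 0;
    getCudaAttribute<int>(&memoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, 0);
    printf("Memory clock: %.0f MHz\n", memoryClockKHz * 1e-3f);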
@@ -66,278 +62,259 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  printf("%s Starting...\n\n", argv[0]);
  printf(
      " CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

  int deviceCount = 0;
  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

  if (error_id != cudaSuccess) {
    printf("cudaGetDeviceCount returned %d\n-> %s\n",
           static_cast<int>(error_id), cudaGetErrorString(error_id));
    printf("Result = FAIL\n");
    exit(EXIT_FAILURE);
  }

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0) {
    printf("There are no available device(s) that support CUDA\n");
  } else {
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
  }

  int dev, driverVersion = 0, runtimeVersion = 0;

  for (dev = 0; dev < deviceCount; ++dev) {
    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    // Console log
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
           deviceProp.major, deviceProp.minor);

    char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(msg, sizeof(msg),
              "  Total amount of global memory:                 %.0f MBytes "
              "(%llu bytes)\n",
              static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
              (unsigned long long)deviceProp.totalGlobalMem);
#else
    snprintf(msg, sizeof(msg),
             "  Total amount of global memory:                 %.0f MBytes "
             "(%llu bytes)\n",
             static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
             (unsigned long long)deviceProp.totalGlobalMem);
#endif
    printf("%s", msg);

    printf("  (%03d) Multiprocessors, (%03d) CUDA Cores/MP:    %d CUDA Cores\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
               deviceProp.multiProcessorCount);
    printf(
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
        "GHz)\n",
        deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
    // This is supported in CUDA 5.0 (runtime API device properties)
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf("  Memory Bus Width:                              %d-bit\n",
           deviceProp.memoryBusWidth);

    if (deviceProp.l2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             deviceProp.l2CacheSize);
    }

#else
    // This is only available in CUDA 4.0-4.2 (but these were only exposed in the
    // CUDA Driver API)
    int memoryClock;
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                          dev);
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           memoryClock * 1e-3f);
    int memBusWidth;
    getCudaAttribute<int>(&memBusWidth,
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
    printf("  Memory Bus Width:                              %d-bit\n",
           memBusWidth);
    int L2CacheSize;
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

    if (L2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             L2CacheSize);
    }

#endif

    printf(
        "  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
        "%d), 3D=(%d, %d, %d)\n",
        deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
        deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
        deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
    printf(
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
        deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
    printf(
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
        "layers\n",
        deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
        deviceProp.maxTexture2DLayered[2]);

    printf("  Total amount of constant memory:               %zu bytes\n",
           deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block:       %zu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf("  Total shared memory per multiprocessor:        %zu bytes\n",
           deviceProp.sharedMemPerMultiprocessor);
    printf("  Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
           deviceProp.warpSize);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of threads per block:           %d\n",
           deviceProp.maxThreadsPerBlock);
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %zu bytes\n",
           deviceProp.memPitch);
    printf("  Texture alignment:                             %zu bytes\n",
           deviceProp.textureAlignment);
    printf(
        "  Concurrent copy and kernel execution:          %s with %d copy "
        "engine(s)\n",
        (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
    printf("  Run time limit on kernels:                     %s\n",
           deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("  Integrated GPU sharing Host Memory:            %s\n",
           deviceProp.integrated ? "Yes" : "No");
    printf("  Support host page-locked memory mapping:       %s\n",
           deviceProp.canMapHostMemory ? "Yes" : "No");
    printf("  Alignment requirement for Surfaces:            %s\n",
           deviceProp.surfaceAlignment ? "Yes" : "No");
    printf("  Device has ECC support:                        %s\n",
           deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
           deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                                : "WDDM (Windows Display Driver Model)");
#endif
    printf("  Device supports Unified Addressing (UVA):      %s\n",
           deviceProp.unifiedAddressing ? "Yes" : "No");
    printf("  Device supports Managed Memory:                %s\n",
           deviceProp.managedMemory ? "Yes" : "No");
    printf("  Device supports Compute Preemption:            %s\n",
           deviceProp.computePreemptionSupported ? "Yes" : "No");
    printf("  Supports Cooperative Kernel Launch:            %s\n",
           deviceProp.cooperativeLaunch ? "Yes" : "No");
    printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
           deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
    printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
           deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    const char *sComputeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device "
        "simultaneously)",
        "Exclusive (only one host thread in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this "
        "device)",
        "Exclusive Process (many threads in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Unknown", NULL};
    printf("  Compute Mode:\n");
    printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
  }

  // If there are 2 or more GPUs, query to determine whether RDMA is supported
  if (deviceCount >= 2) {
    cudaDeviceProp prop[64];
    int gpuid[64];  // we want to find the first two GPUs that can support P2P
    int gpu_p2p_count = 0;

    for (int i = 0; i < deviceCount; i++) {
      checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
 | 
			
		||||
 | 
			
		||||
      // Only boards based on Fermi or later can support P2P
 | 
			
		||||
      if ((prop[i].major >= 2)
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
          // on Windows (64-bit), the Tesla Compute Cluster driver for windows
 | 
			
		||||
          // must be enabled to support this
 | 
			
		||||
          && prop[i].tccDriver
 | 
			
		||||
#endif
 | 
			
		||||
          ) {
 | 
			
		||||
        // This is an array of P2P capable GPUs
 | 
			
		||||
        gpuid[gpu_p2p_count++] = i;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Show all the combinations of support P2P GPUs
 | 
			
		||||
    int can_access_peer;
 | 
			
		||||
 | 
			
		||||
    if (gpu_p2p_count >= 2) {
 | 
			
		||||
      for (int i = 0; i < gpu_p2p_count; i++) {
 | 
			
		||||
        for (int j = 0; j < gpu_p2p_count; j++) {
 | 
			
		||||
          if (gpuid[i] == gpuid[j]) {
 | 
			
		||||
            continue;
 | 
			
		||||
          }
 | 
			
		||||
          checkCudaErrors(
 | 
			
		||||
              cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 | 
			
		||||
          printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
 | 
			
		||||
                 prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
 | 
			
		||||
                 can_access_peer ? "Yes" : "No");
 | 
			
		||||
        if (deviceProp.l2CacheSize) {
 | 
			
		||||
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the
 | 
			
		||||
        // CUDA Driver API)
 | 
			
		||||
        int memoryClock;
 | 
			
		||||
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 | 
			
		||||
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 | 
			
		||||
        int memBusWidth;
 | 
			
		||||
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | 
			
		||||
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 | 
			
		||||
        int L2CacheSize;
 | 
			
		||||
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | 
			
		||||
 | 
			
		||||
        if (L2CacheSize) {
 | 
			
		||||
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
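
        // For context: getCudaAttribute<T> (defined in this sample / the samples'
        // helper headers) is essentially a checked, typed wrapper around the
        // driver API. A minimal sketch of the idea (illustrative, not necessarily
        // the exact implementation used here):
        //
        //   template <class T>
        //   inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
        //   {
        //       checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
        //   }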
        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
 | 
			
		||||
               "%d), 3D=(%d, %d, %d)\n",
 | 
			
		||||
               deviceProp.maxTexture1D,
 | 
			
		||||
               deviceProp.maxTexture2D[0],
 | 
			
		||||
               deviceProp.maxTexture2D[1],
 | 
			
		||||
               deviceProp.maxTexture3D[0],
 | 
			
		||||
               deviceProp.maxTexture3D[1],
 | 
			
		||||
               deviceProp.maxTexture3D[2]);
 | 
			
		||||
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
 | 
			
		||||
               deviceProp.maxTexture1DLayered[0],
 | 
			
		||||
               deviceProp.maxTexture1DLayered[1]);
 | 
			
		||||
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
 | 
			
		||||
               "layers\n",
 | 
			
		||||
               deviceProp.maxTexture2DLayered[0],
 | 
			
		||||
               deviceProp.maxTexture2DLayered[1],
 | 
			
		||||
               deviceProp.maxTexture2DLayered[2]);
 | 
			
		||||
 | 
			
		||||
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
 | 
			
		||||
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
 | 
			
		||||
        printf("  Total shared memory per multiprocessor:        %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
 | 
			
		||||
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
 | 
			
		||||
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
 | 
			
		||||
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
 | 
			
		||||
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
 | 
			
		||||
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
 | 
			
		||||
               deviceProp.maxThreadsDim[0],
 | 
			
		||||
               deviceProp.maxThreadsDim[1],
 | 
			
		||||
               deviceProp.maxThreadsDim[2]);
 | 
			
		||||
        printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
 | 
			
		||||
               deviceProp.maxGridSize[0],
 | 
			
		||||
               deviceProp.maxGridSize[1],
 | 
			
		||||
               deviceProp.maxGridSize[2]);
 | 
			
		||||
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
 | 
			
		||||
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
 | 
			
		||||
        printf("  Concurrent copy and kernel execution:          %s with %d copy "
 | 
			
		||||
               "engine(s)\n",
 | 
			
		||||
               (deviceProp.deviceOverlap ? "Yes" : "No"),
 | 
			
		||||
               deviceProp.asyncEngineCount);
 | 
			
		||||
        printf("  Run time limit on kernels:                     %s\n",
 | 
			
		||||
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
 | 
			
		||||
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
 | 
			
		||||
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
 | 
			
		||||
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
 | 
			
		||||
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
 | 
			
		||||
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 | 
			
		||||
#endif
 | 
			
		||||
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
 | 
			
		||||
        printf("  Device supports Managed Memory:                %s\n", deviceProp.managedMemory ? "Yes" : "No");
 | 
			
		||||
        printf("  Device supports Compute Preemption:            %s\n",
 | 
			
		||||
               deviceProp.computePreemptionSupported ? "Yes" : "No");
 | 
			
		||||
        printf("  Supports Cooperative Kernel Launch:            %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
 | 
			
		||||
        printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
 | 
			
		||||
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
 | 
			
		||||
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
 | 
			
		||||
               deviceProp.pciDomainID,
 | 
			
		||||
               deviceProp.pciBusID,
 | 
			
		||||
               deviceProp.pciDeviceID);
 | 
			
		||||
 | 
			
		||||
        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
 | 
			
		||||
                                      "simultaneously)",
 | 
			
		||||
                                      "Exclusive (only one host thread in one process is able to use "
 | 
			
		||||
                                      "::cudaSetDevice() with this device)",
 | 
			
		||||
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
 | 
			
		||||
                                      "device)",
 | 
			
		||||
                                      "Exclusive Process (many threads in one process is able to use "
 | 
			
		||||
                                      "::cudaSetDevice() with this device)",
 | 
			
		||||
                                      "Unknown",
 | 
			
		||||
                                      NULL};
 | 
			
		||||
        printf("  Compute Mode:\n");
 | 
			
		||||
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // csv masterlog info
 | 
			
		||||
  // *****************************
 | 
			
		||||
  // exe and CUDA driver name
 | 
			
		||||
  printf("\n");
 | 
			
		||||
  std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
 | 
			
		||||
  char cTemp[16];
 | 
			
		||||
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
 | 
			
		||||
    if (deviceCount >= 2) {
 | 
			
		||||
        cudaDeviceProp prop[64];
 | 
			
		||||
        int            gpuid[64]; // we want to find the first two GPUs that can support P2P
 | 
			
		||||
        int            gpu_p2p_count = 0;
 | 
			
		||||
 | 
			
		||||
  // driver version
 | 
			
		||||
  sProfileString += ", CUDA Driver Version = ";
 | 
			
		||||
        for (int i = 0; i < deviceCount; i++) {
 | 
			
		||||
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
 | 
			
		||||
 | 
			
		||||
            // Only boards based on Fermi or later can support P2P
 | 
			
		||||
            if ((prop[i].major >= 2)
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
  sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000,
 | 
			
		||||
            (driverVersion % 100) / 10);
 | 
			
		||||
#else
 | 
			
		||||
  snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
 | 
			
		||||
           (driverVersion % 100) / 10);
 | 
			
		||||
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
 | 
			
		||||
                // must be enabled to support this
 | 
			
		||||
                && prop[i].tccDriver
 | 
			
		||||
#endif
 | 
			
		||||
  sProfileString += cTemp;
 | 
			
		||||
            ) {
 | 
			
		||||
                // This is an array of P2P capable GPUs
 | 
			
		||||
                gpuid[gpu_p2p_count++] = i;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
  // Runtime version
 | 
			
		||||
  sProfileString += ", CUDA Runtime Version = ";
 | 
			
		||||
        // Show all the combinations of support P2P GPUs
 | 
			
		||||
        int can_access_peer;
 | 
			
		||||
 | 
			
		||||
        if (gpu_p2p_count >= 2) {
 | 
			
		||||
            for (int i = 0; i < gpu_p2p_count; i++) {
 | 
			
		||||
                for (int j = 0; j < gpu_p2p_count; j++) {
 | 
			
		||||
                    if (gpuid[i] == gpuid[j]) {
 | 
			
		||||
                        continue;
 | 
			
		||||
                    }
 | 
			
		||||
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 | 
			
		||||
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
 | 
			
		||||
                           prop[gpuid[i]].name,
 | 
			
		||||
                           gpuid[i],
 | 
			
		||||
                           prop[gpuid[j]].name,
 | 
			
		||||
                           gpuid[j],
 | 
			
		||||
                           can_access_peer ? "Yes" : "No");
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
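
    // Hedged usage sketch (not part of the original sample): a "Yes" from
    // cudaDeviceCanAccessPeer() only means mapping is possible; peer access still
    // has to be enabled per direction before one GPU can dereference the other's
    // memory. Assuming gpuid[0] and gpuid[1] passed the check above:
    //
    //   checkCudaErrors(cudaSetDevice(gpuid[0]));
    //   checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0)); // flags must be 0
    //   checkCudaErrors(cudaSetDevice(gpuid[1]));
    //   checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));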

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char        cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;
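
    // Worked example of the decoding above: CUDA encodes these versions as
    // major * 1000 + minor * 10, so driverVersion == 12050 yields
    // 12050 / 1000 = 12 and (12050 % 100) / 10 = 5, i.e. "12.5".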

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
    sProfileString += cTemp;
    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}

@@ -30,358 +30,295 @@
 */

// includes, system
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

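// Build note (an illustrative sketch, not part of the sample or its build files):
// because this file calls the CUDA Driver API directly, it must link against the
// driver library (cuda.lib on Windows, libcuda on Linux), e.g. something like:
//
//   nvcc -I<path-to-samples-Common-headers> -o deviceQueryDrv deviceQueryDrv.cpp -lcuda
//
// where the include path for the helper headers is an assumption about your
// checkout layout.
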
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    CUdevice dev;
    int      major = 0, minor = 0;
    int      deviceCount = 0;
    char     deviceName[256];

  printf("%s Starting...\n\n", argv[0]);
 | 
			
		||||
    printf("%s Starting...\n\n", argv[0]);
 | 
			
		||||
 | 
			
		||||
  // note your project will need to link with cuda.lib files on windows
 | 
			
		||||
  printf("CUDA Device Query (Driver API) statically linked version \n");
 | 
			
		||||
    // note your project will need to link with cuda.lib files on windows
 | 
			
		||||
    printf("CUDA Device Query (Driver API) statically linked version \n");
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cuInit(0));
 | 
			
		||||
    checkCudaErrors(cuInit(0));
 | 
			
		||||
 | 
			
		||||
  checkCudaErrors(cuDeviceGetCount(&deviceCount));
 | 
			
		||||
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
 | 
			
		||||
 | 
			
		||||
  // This function call returns 0 if there are no CUDA capable devices.
 | 
			
		||||
  if (deviceCount == 0) {
 | 
			
		||||
    printf("There are no available device(s) that support CUDA\n");
 | 
			
		||||
  } else {
 | 
			
		||||
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for (dev = 0; dev < deviceCount; ++dev) {
 | 
			
		||||
    checkCudaErrors(cuDeviceGetAttribute(
 | 
			
		||||
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
 | 
			
		||||
    checkCudaErrors(cuDeviceGetAttribute(
 | 
			
		||||
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
 | 
			
		||||
 | 
			
		||||
    checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
 | 
			
		||||
 | 
			
		||||
    printf("\nDevice %d: \"%s\"\n", dev, deviceName);
 | 
			
		||||
 | 
			
		||||
    int driverVersion = 0;
 | 
			
		||||
    checkCudaErrors(cuDriverGetVersion(&driverVersion));
 | 
			
		||||
    printf("  CUDA Driver Version:                           %d.%d\n",
 | 
			
		||||
           driverVersion / 1000, (driverVersion % 100) / 10);
 | 
			
		||||
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major,
 | 
			
		||||
           minor);
 | 
			
		||||
 | 
			
		||||
    size_t totalGlobalMem;
 | 
			
		||||
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
 | 
			
		||||
 | 
			
		||||
    char msg[256];
 | 
			
		||||
    SPRINTF(msg,
 | 
			
		||||
            "  Total amount of global memory:                 %.0f MBytes "
 | 
			
		||||
            "(%llu bytes)\n",
 | 
			
		||||
            (float)totalGlobalMem / 1048576.0f,
 | 
			
		||||
            (unsigned long long)totalGlobalMem);
 | 
			
		||||
    printf("%s", msg);
 | 
			
		||||
 | 
			
		||||
    int multiProcessorCount;
 | 
			
		||||
    getCudaAttribute<int>(&multiProcessorCount,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 | 
			
		||||
 | 
			
		||||
    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
 | 
			
		||||
           multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
 | 
			
		||||
           _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
 | 
			
		||||
 | 
			
		||||
    int clockRate;
 | 
			
		||||
    getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
 | 
			
		||||
        "GHz)\n",
 | 
			
		||||
        clockRate * 1e-3f, clockRate * 1e-6f);
 | 
			
		||||
    int memoryClock;
 | 
			
		||||
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Memory Clock rate:                             %.0f Mhz\n",
 | 
			
		||||
           memoryClock * 1e-3f);
 | 
			
		||||
    int memBusWidth;
 | 
			
		||||
    getCudaAttribute<int>(&memBusWidth,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | 
			
		||||
    printf("  Memory Bus Width:                              %d-bit\n",
 | 
			
		||||
           memBusWidth);
 | 
			
		||||
    int L2CacheSize;
 | 
			
		||||
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | 
			
		||||
 | 
			
		||||
    if (L2CacheSize) {
 | 
			
		||||
      printf("  L2 Cache Size:                                 %d bytes\n",
 | 
			
		||||
             L2CacheSize);
 | 
			
		||||
    // This function call returns 0 if there are no CUDA capable devices.
 | 
			
		||||
    if (deviceCount == 0) {
 | 
			
		||||
        printf("There are no available device(s) that support CUDA\n");
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    int maxTex1D, maxTex2D[2], maxTex3D[3];
 | 
			
		||||
    getCudaAttribute<int>(&maxTex1D,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2D[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2D[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex3D[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex3D[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex3D[2],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) "
 | 
			
		||||
        "3D=(%d, %d, %d)\n",
 | 
			
		||||
        maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1],
 | 
			
		||||
        maxTex3D[2]);
 | 
			
		||||
    for (dev = 0; dev < deviceCount; ++dev) {
 | 
			
		||||
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
 | 
			
		||||
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
 | 
			
		||||
 | 
			
		||||
    int maxTex1DLayered[2];
 | 
			
		||||
    getCudaAttribute<int>(&maxTex1DLayered[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex1DLayered[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
 | 
			
		||||
        maxTex1DLayered[0], maxTex1DLayered[1]);
 | 
			
		||||
        checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
 | 
			
		||||
 | 
			
		||||
    int maxTex2DLayered[3];
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2DLayered[0],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2DLayered[1],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&maxTex2DLayered[2],
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
 | 
			
		||||
        "layers\n",
 | 
			
		||||
        maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
 | 
			
		||||
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);
 | 
			
		||||
 | 
			
		||||
    int totalConstantMemory;
 | 
			
		||||
    getCudaAttribute<int>(&totalConstantMemory,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
 | 
			
		||||
    printf("  Total amount of constant memory:               %u bytes\n",
 | 
			
		||||
           totalConstantMemory);
 | 
			
		||||
    int sharedMemPerBlock;
 | 
			
		||||
    getCudaAttribute<int>(&sharedMemPerBlock,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
 | 
			
		||||
    printf("  Total amount of shared memory per block:       %u bytes\n",
 | 
			
		||||
           sharedMemPerBlock);
 | 
			
		||||
    int regsPerBlock;
 | 
			
		||||
    getCudaAttribute<int>(®sPerBlock,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 | 
			
		||||
    printf("  Total number of registers available per block: %d\n",
 | 
			
		||||
           regsPerBlock);
 | 
			
		||||
    int warpSize;
 | 
			
		||||
    getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 | 
			
		||||
    printf("  Warp size:                                     %d\n", warpSize);
 | 
			
		||||
    int maxThreadsPerMultiProcessor;
 | 
			
		||||
    getCudaAttribute<int>(&maxThreadsPerMultiProcessor,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Maximum number of threads per multiprocessor:  %d\n",
 | 
			
		||||
           maxThreadsPerMultiProcessor);
 | 
			
		||||
    int maxThreadsPerBlock;
 | 
			
		||||
    getCudaAttribute<int>(&maxThreadsPerBlock,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
 | 
			
		||||
    printf("  Maximum number of threads per block:           %d\n",
 | 
			
		||||
           maxThreadsPerBlock);
 | 
			
		||||
        int driverVersion = 0;
 | 
			
		||||
        checkCudaErrors(cuDriverGetVersion(&driverVersion));
 | 
			
		||||
        printf("  CUDA Driver Version:                           %d.%d\n",
 | 
			
		||||
               driverVersion / 1000,
 | 
			
		||||
               (driverVersion % 100) / 10);
 | 
			
		||||
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);
 | 
			
		||||
 | 
			
		||||
    int blockDim[3];
 | 
			
		||||
    getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
 | 
			
		||||
                          dev);
 | 
			
		||||
    getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
 | 
			
		||||
           blockDim[0], blockDim[1], blockDim[2]);
 | 
			
		||||
    int gridDim[3];
 | 
			
		||||
    getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
 | 
			
		||||
    getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
 | 
			
		||||
    getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
 | 
			
		||||
    printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n",
 | 
			
		||||
           gridDim[0], gridDim[1], gridDim[2]);
 | 
			
		||||
        size_t totalGlobalMem;
 | 
			
		||||
        checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
 | 
			
		||||
 | 
			
		||||
    int textureAlign;
 | 
			
		||||
    getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
 | 
			
		||||
                          dev);
 | 
			
		||||
    printf("  Texture alignment:                             %u bytes\n",
 | 
			
		||||
           textureAlign);
 | 
			
		||||
        char msg[256];
 | 
			
		||||
        SPRINTF(msg,
 | 
			
		||||
                "  Total amount of global memory:                 %.0f MBytes "
 | 
			
		||||
                "(%llu bytes)\n",
 | 
			
		||||
                (float)totalGlobalMem / 1048576.0f,
 | 
			
		||||
                (unsigned long long)totalGlobalMem);
 | 
			
		||||
        printf("%s", msg);
 | 
			
		||||
 | 
			
		||||
    int memPitch;
 | 
			
		||||
    getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
 | 
			
		||||
    printf("  Maximum memory pitch:                          %u bytes\n",
 | 
			
		||||
           memPitch);
 | 
			
		||||
        int multiProcessorCount;
 | 
			
		||||
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 | 
			
		||||
 | 
			
		||||
    int gpuOverlap;
 | 
			
		||||
    getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 | 
			
		||||
        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
 | 
			
		||||
               multiProcessorCount,
 | 
			
		||||
               _ConvertSMVer2CoresDRV(major, minor),
 | 
			
		||||
               _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
 | 
			
		||||
 | 
			
		||||
    int asyncEngineCount;
 | 
			
		||||
    getCudaAttribute<int>(&asyncEngineCount,
 | 
			
		||||
                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
 | 
			
		||||
    printf(
 | 
			
		||||
        "  Concurrent copy and kernel execution:          %s with %d copy "
 | 
			
		||||
        "engine(s)\n",
 | 
			
		||||
        (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
 | 
			
		||||
        int clockRate;
 | 
			
		||||
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 | 
			
		||||
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f "
 | 
			
		||||
               "GHz)\n",
 | 
			
		||||
               clockRate * 1e-3f,
 | 
			
		||||
               clockRate * 1e-6f);
 | 
			
		||||
        int memoryClock;
 | 
			
		||||
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 | 
			
		||||
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 | 
			
		||||
        int memBusWidth;
 | 
			
		||||
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 | 
			
		||||
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 | 
			
		||||
        int L2CacheSize;
 | 
			
		||||
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 | 
			
		||||
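
        // Unit note: CU_DEVICE_ATTRIBUTE_CLOCK_RATE and
        // CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE report kilohertz, which is why
        // the prints above scale by 1e-3f (kHz -> MHz) and 1e-6f (kHz -> GHz);
        // the L2 cache attribute, by contrast, is already in bytes.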

        if (L2CacheSize) {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

        int maxTex1D, maxTex2D[2], maxTex3D[3];
        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
        printf("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) "
               "3D=(%d, %d, %d)\n",
               maxTex1D,
               maxTex2D[0],
               maxTex2D[1],
               maxTex3D[0],
               maxTex3D[1],
               maxTex3D[2]);

        int maxTex1DLayered[2];
        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               maxTex1DLayered[0],
               maxTex1DLayered[1]);

        int maxTex2DLayered[3];
        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
               "layers\n",
               maxTex2DLayered[0],
               maxTex2DLayered[1],
               maxTex2DLayered[2]);

        int totalConstantMemory;
        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
        int sharedMemPerBlock;
        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
        int regsPerBlock;
        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
        int warpSize;
        getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
        printf("  Warp size:                                     %d\n", warpSize);
        int maxThreadsPerMultiProcessor;
        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
        printf("  Maximum number of threads per multiprocessor:  %d\n", maxThreadsPerMultiProcessor);
        int maxThreadsPerBlock;
        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        printf("  Maximum number of threads per block:           %d\n", maxThreadsPerBlock);

        int blockDim[3];
        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
        int gridDim[3];
        getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);

        int textureAlign;
        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
        printf("  Texture alignment:                             %u bytes\n", textureAlign);

        int memPitch;
        getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);

        int gpuOverlap;
        getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

        int asyncEngineCount;
        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
        printf("  Concurrent copy and kernel execution:          %s with %d copy "
               "engine(s)\n",
               (gpuOverlap ? "Yes" : "No"),
               asyncEngineCount);

        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
        printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
        int integrated;
        getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
        printf("  Integrated GPU sharing Host Memory:            %s\n", integrated ? "Yes" : "No");
        int canMapHostMemory;
        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");

        int concurrentKernels;
        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");

        int surfaceAlignment;
        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
        printf("  Alignment requirement for Surfaces:            %s\n", surfaceAlignment ? "Yes" : "No");

        int eccEnabled;
        getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
        printf("  Device has ECC support:                        %s\n", eccEnabled ? "Enabled" : "Disabled");

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        int tccDriver;
        getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
               tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif

        int unifiedAddressing;
        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
        printf("  Device supports Unified Addressing (UVA):      %s\n", unifiedAddressing ? "Yes" : "No");

        int managedMemory;
        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
        printf("  Device supports Managed Memory:                %s\n", managedMemory ? "Yes" : "No");

        int computePreemption;
        getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
        printf("  Device supports Compute Preemption:            %s\n", computePreemption ? "Yes" : "No");

        int cooperativeLaunch;
        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
        printf("  Supports Cooperative Kernel Launch:            %s\n", cooperativeLaunch ? "Yes" : "No");

        int cooperativeMultiDevLaunch;
        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
        printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");

        int pciDomainID, pciBusID, pciDeviceID;
        getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
        getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
        getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);

        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with this device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
                                      "device)",
                                      "Exclusive Process (many threads in one process are able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};

        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[computeMode]);
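
        // The compute mode reported here is an administrator setting rather than
        // a fixed device property; on Linux it can typically be changed with
        // nvidia-smi, e.g. `nvidia-smi -c EXCLUSIVE_PROCESS` (requires sufficient
        // privileges).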
    }

  printf("Result = PASS\n");
 | 
			
		||||
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
 | 
			
		||||
    if (deviceCount >= 2) {
 | 
			
		||||
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
 | 
			
		||||
        int gpu_p2p_count = 0;
 | 
			
		||||
        int tccDriver     = 0;
 | 
			
		||||
 | 
			
		||||
  exit(EXIT_SUCCESS);
 | 
			
		||||
        for (int i = 0; i < deviceCount; i++) {
 | 
			
		||||
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
 | 
			
		||||
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
 | 
			
		||||
            getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
 | 
			
		||||
 | 
			
		||||
            // Only boards based on Fermi or later can support P2P
 | 
			
		||||
            if ((major >= 2)
 | 
			
		||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 | 
			
		||||
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
 | 
			
		||||
                // must be enabled to support this
 | 
			
		||||
                && tccDriver
 | 
			
		||||
#endif
 | 
			
		||||
            ) {
 | 
			
		||||
                // This is an array of P2P capable GPUs
 | 
			
		||||
                gpuid[gpu_p2p_count++] = i;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Show all the combinations of support P2P GPUs
 | 
			
		||||
        int  can_access_peer;
 | 
			
		||||
        char deviceName0[256], deviceName1[256];
 | 
			
		||||
 | 
			
		||||
        if (gpu_p2p_count >= 2) {
 | 
			
		||||
            for (int i = 0; i < gpu_p2p_count; i++) {
 | 
			
		||||
                for (int j = 0; j < gpu_p2p_count; j++) {
 | 
			
		||||
                    if (gpuid[i] == gpuid[j]) {
 | 
			
		||||
                        continue;
 | 
			
		||||
                    }
 | 
			
		||||
                    checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 | 
			
		||||
                    checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
 | 
			
		||||
                    checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
 | 
			
		||||
                    printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
 | 
			
		||||
                           "%s\n",
 | 
			
		||||
                           deviceName0,
 | 
			
		||||
                           gpuid[i],
 | 
			
		||||
                           deviceName1,
 | 
			
		||||
                           gpuid[j],
 | 
			
		||||
                           can_access_peer ? "Yes" : "No");
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("Result = PASS\n");
 | 
			
		||||
 | 
			
		||||
    exit(EXIT_SUCCESS);
 | 
			
		||||
}
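The fragment above depends on two helpers pulled in from the samples' common headers via `helper_cuda.h`: the `checkCudaErrors` macro and the templated `getCudaAttribute`. As a minimal sketch of the latter — assuming it matches the deviceQueryDrv sample and that `cuInit(0)` has already run — it is just a thin wrapper over the driver-API attribute query:

```cpp
// Sketch, not normative: getCudaAttribute as used above. Only T = int is
// ever instantiated in the sample, which matches the int* out-parameter
// that cuDeviceGetAttribute expects.
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
```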
@@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute

Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@@ -35,48 +35,44 @@

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

int main(int argc, char **argv)
{
    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));

    // Enumerates Device <-> Device links
    for (int device1 = 0; device1 < deviceCount; device1++) {
        for (int device2 = 0; device2 < deviceCount; device2++) {
            if (device1 == device2)
                continue;

            int perfRank        = 0;
            int atomicSupported = 0;
            int accessSupported = 0;

            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
            checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));

            if (accessSupported) {
                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
                std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                std::cout << "  * Perf Rank: " << perfRank << std::endl;
            }
        }
    }

    // Enumerates Device <-> Host links
    for (int device = 0; device < deviceCount; device++) {
        int atomicSupported = 0;
        checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
        std::cout << "GPU" << device << " <-> CPU:" << std::endl;
        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    }

    return 0;
}
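Both query paths above only report capability. To actually use a link that `cudaDevP2PAttrAccessSupported` (or `cuDeviceCanAccessPeer`) reports as available, peer access still has to be enabled explicitly, once per direction. A minimal runtime-API sketch follows; the helper name `enablePeerPair` is hypothetical and not part of the samples:

```cpp
#include <cuda_runtime.h>

// Hypothetical helper: enable peer access in both directions between
// dev0 and dev1, after confirming support each way.
static bool enablePeerPair(int dev0, int dev1)
{
    int ok01 = 0, ok10 = 0;
    cudaDeviceGetP2PAttribute(&ok01, cudaDevP2PAttrAccessSupported, dev0, dev1);
    cudaDeviceGetP2PAttribute(&ok10, cudaDevP2PAttrAccessSupported, dev1, dev0);
    if (!ok01 || !ok10)
        return false;

    // Peer access is per-direction and tied to the current device; the
    // flags argument of cudaDeviceEnablePeerAccess must be 0.
    cudaSetDevice(dev0);
    cudaError_t err = cudaDeviceEnablePeerAccess(dev1, 0);
    if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled)
        return false;

    cudaSetDevice(dev1);
    err = cudaDeviceEnablePeerAccess(dev0, 0);
    return err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled;
}
```

Once both directions are enabled, a kernel running on either device can dereference pointers allocated with `cudaMalloc` on the other, and peer copies can take the direct PCIe/NVLink path instead of staging through host memory.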