mirror of https://github.com/NVIDIA/cuda-samples.git
synced 2025-12-16 10:37:48 +08:00

Merge branch 'master' into cuda_a_dev
This commit is contained in: eddc6fd7e1

.clang-format (new file, 49 lines)
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
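The settings above are easiest to read against an example. A hand-written before/after sketch of what this configuration does to a small struct (hypothetical code, not from the repo; the Task changes further down show the same rewrite on real sources):

// Before: attached braces, packed constructor-initializer list.
struct Config {
    Config() : device(0), flags(0){};
    int device, flags;
};

// After: BraceWrapping AfterStruct/AfterFunction put braces on their own
// line, and BreakConstructorInitializers: BeforeComma splits the list with
// leading commas.
struct Config
{
    Config()
        : device(0)
        , flags(0) {};
    int device, flags;
};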
.pre-commit-config.yaml (new file, 100 lines)
# Copyright (c) 2024, NVIDIA CORPORATION.

ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
README.md

@@ -1,6 +1,6 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

UnifiedMemoryStreams.cu

@@ -31,10 +31,10 @@
  */

 // system includes
+#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
-#include <algorithm>

 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else

@@ -58,15 +58,25 @@ double drand48() { return double(rand()) / RAND_MAX; }
 const char *sSDKname = "UnifiedMemoryStreams";

 // simple task
-template <typename T>
-struct Task {
+template <typename T> struct Task
+{
     unsigned int size, id;
     T *data;
     T *result;
     T *vector;

-    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
-    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
+    Task()
+        : size(0)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+        , vector(NULL) {};
+    Task(unsigned int s)
+        : size(s)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+    {
         // allocate unified memory -- the operation performed in this example will
         // be a DGEMV
         checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));

@@ -75,7 +85,8 @@ struct Task {
         checkCudaErrors(cudaDeviceSynchronize());
     }

-    ~Task() {
+    ~Task()
+    {
         // ensure all memory is deallocated
         checkCudaErrors(cudaDeviceSynchronize());
         checkCudaErrors(cudaFree(data));

@@ -83,7 +94,8 @@ struct Task {
         checkCudaErrors(cudaFree(vector));
     }

-    void allocate(const unsigned int s, const unsigned int unique_id) {
+    void allocate(const unsigned int s, const unsigned int unique_id)
+    {
         // allocate unified memory outside of constructor
         id = unique_id;
         size = s;

@@ -105,7 +117,8 @@ struct Task {
 };

 #ifdef USE_PTHREADS
-struct threadData_t {
+struct threadData_t
+{
     int tid;
     Task<double> *TaskListPtr;
     cudaStream_t *streams;

@@ -117,8 +130,8 @@ typedef struct threadData_t threadData;
 #endif

 // simple host dgemv: assume data is in row-major format and square
-template <typename T>
-void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
+template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
+{
     // rows
     for (int i = 0; i < n; i++) {
         result[i] *= beta;

@@ -131,7 +144,8 @@ void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {

 // execute a single task on either host or device depending on size
 #ifdef USE_PTHREADS
-void *execute(void *inpArgs) {
+void *execute(void *inpArgs)
+{
     threadData *dataPtr = (threadData *)inpArgs;
     cudaStream_t *stream = dataPtr->streams;
     cublasHandle_t *handle = dataPtr->handles;

@@ -142,92 +156,75 @@ void *execute(void *inpArgs) {

     if (t.size < 100) {
         // perform on host
-        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

         // attach managed memory to a (dummy) stream to allow host access while
         // the device is running
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
         // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
         checkCudaErrors(cudaStreamSynchronize(stream[0]));
         // call the host operation
         gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    } else {
+    }
+    else {
         // perform on device
-        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
         double one = 1.0;
         double zero = 0.0;

         // attach managed memory to my stream
         checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                                 cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
         // call the device operation
-        checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                    &one, t.data, t.size, t.vector, 1, &zero,
-                                    t.result, 1));
+        checkCudaErrors(cublasDgemv(
+            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
     }
 }

 pthread_exit(NULL);
 }
 #else
-template <typename T>
-void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
-             int tid) {
+template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
+{
     if (t.size < 100) {
         // perform on host
-        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

         // attach managed memory to a (dummy) stream to allow host access while the
         // device is running
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
         // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
         checkCudaErrors(cudaStreamSynchronize(stream[0]));
         // call the host operation
         gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    } else {
+    }
+    else {
         // perform on device
-        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
         double one = 1.0;
         double zero = 0.0;

         // attach managed memory to my stream
         checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                                 cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
         // call the device operation
-        checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                    &one, t.data, t.size, t.vector, 1, &zero,
-                                    t.result, 1));
+        checkCudaErrors(cublasDgemv(
+            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
     }
 }
 #endif

 // populate a list of tasks with random sizes
-template <typename T>
-void initialise_tasks(std::vector<Task<T> > &TaskList) {
+template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
+{
     for (unsigned int i = 0; i < TaskList.size(); i++) {
         // generate random size
         int size;

@@ -236,7 +233,8 @@ void initialise_tasks(std::vector<Task<T> > &TaskList) {
     }
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     // set device
     cudaDeviceProp device_prop;
     int dev_id = findCudaDevice(argc, (const char **)argv);

@@ -294,19 +292,17 @@ int main(int argc, char **argv) {

         if ((TaskList.size() / nthreads) == 0) {
             InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-            InputToThreads[i].TaskListPtr =
-                &TaskList[i * (TaskList.size() / nthreads)];
-        } else {
+            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+        }
+        else {
             if (i == nthreads - 1) {
-                InputToThreads[i].taskSize =
-                    (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
                 InputToThreads[i].TaskListPtr =
-                    &TaskList[i * (TaskList.size() / nthreads) +
-                              (TaskList.size() % nthreads)];
-            } else {
+                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
+            }
+            else {
                 InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-                InputToThreads[i].TaskListPtr =
-                    &TaskList[i * (TaskList.size() / nthreads)];
+                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
             }
         }
     }
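The hunks above keep reflowing one idiom: allocate managed memory, attach it to a stream, synchronize, then touch it from the host. A minimal self-contained sketch of that idiom (illustrative names and sizes, not the sample's code; error checks elided):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    float *data = NULL;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // One allocation visible to both host and device.
    cudaMallocManaged(&data, 1024 * sizeof(float));

    // cudaMemAttachHost: the host may touch this memory even while unrelated
    // device work is in flight. The attach call itself is asynchronous, so
    // synchronize the stream before the first host access.
    cudaStreamAttachMemAsync(stream, data, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream);
    data[0] = 42.0f; // safe host access
    printf("%f\n", data[0]);

    // cudaMemAttachSingle: hand the allocation to this stream for device work.
    cudaStreamAttachMemAsync(stream, data, 0, cudaMemAttachSingle);

    cudaFree(data);
    cudaStreamDestroy(stream);
    return 0;
}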
asyncAPI.cu

@@ -38,19 +38,21 @@
 #include <stdio.h>

 // includes CUDA Runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>

 // includes, project
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper utility functions

-__global__ void increment_kernel(int *g_data, int inc_value) {
+__global__ void increment_kernel(int *g_data, int inc_value)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     g_data[idx] = g_data[idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x) {
+bool correct_output(int *data, const int n, const int x)
+{
     for (int i = 0; i < n; i++)
         if (data[i] != x) {
             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);

@@ -60,7 +62,8 @@ bool correct_output(int *data, const int n, const int x) {
     return true;
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int devID;
     cudaDeviceProp deviceProps;

@@ -126,8 +129,7 @@ int main(int argc, char *argv[]) {
     // print the cpu and gpu times
     printf("time spent executing by the GPU: %.2f\n", gpu_time);
     printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-    printf("CPU executed %lu iterations while waiting for GPU to finish\n",
-           counter);
+    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

     // check the output for correctness
     bool bFinalResults = correct_output(a, n, value);
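These hunks reformat the sample's event-based timing. The underlying pattern, as a compact standalone sketch (illustrative sizes; error checking elided):

#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

__global__ void inc(int *g, int v) { g[blockIdx.x * blockDim.x + threadIdx.x] += v; }

int main()
{
    const int n = 1 << 20, nbytes = n * sizeof(int);
    int *h = NULL, *d = NULL;
    cudaMallocHost((void **)&h, nbytes); // pinned memory: required for truly async copies
    memset(h, 0, nbytes);
    cudaMalloc((void **)&d, nbytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Queue everything asynchronously in stream 0, bracketed by events.
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d, h, nbytes, cudaMemcpyHostToDevice, 0);
    inc<<<n / 256, 256>>>(d, 7);
    cudaMemcpyAsync(h, d, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);

    // The CPU stays busy while the GPU pipeline drains.
    unsigned long counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        counter++;

    float gpu_time = 0.0f;
    cudaEventElapsedTime(&gpu_time, start, stop);
    printf("GPU time: %.2f ms, CPU spun %lu times\n", gpu_time, counter);

    cudaFreeHost(h);
    cudaFree(d);
    return 0;
}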
clock.cu

@@ -48,15 +48,16 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];

@@ -77,11 +78,13 @@ __global__ static void timedReduction(const float *input, float *output,
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }

 #define NUM_BLOCKS 64

@@ -104,7 +107,8 @@ __global__ static void timedReduction(const float *input, float *output,
 // the memory. With more than 32 the speed scales linearly.

 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     // This will pick the best possible CUDA capable device

@@ -121,20 +125,15 @@ int main(int argc, char **argv) {
         input[i] = (float)i;
     }

-    checkCudaErrors(
-        cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(
-        cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

-    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-        dinput, doutput, dtimer);
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

-    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

     checkCudaErrors(cudaFree(dinput));
     checkCudaErrors(cudaFree(doutput));
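For reference, the timer array the kernel fills holds per-block start clocks in [0, NUM_BLOCKS) and stop clocks in [NUM_BLOCKS, 2*NUM_BLOCKS). A plausible host-side use of it, continuing the variables above and assuming that layout:

// Sketch: average on-chip cycles per block, from the copied-back timer array.
double avgClocks = 0.0;
for (int i = 0; i < NUM_BLOCKS; i++) {
    avgClocks += (double)(timer[NUM_BLOCKS + i] - timer[i]); // stop - start
}
avgClocks /= NUM_BLOCKS;
printf("Average clocks/block = %.0f\n", avgClocks);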
clock_nvrtc (host code)

@@ -34,12 +34,11 @@
  */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>

 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>

@@ -71,7 +70,8 @@

 // Start the main CUDA Sample here

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     typedef long clock_t;

@@ -106,17 +106,20 @@ int main(int argc, char **argv) {

     void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-    checkCudaErrors(cuLaunchKernel(
-        kernel_addr, cudaGridSize.x, cudaGridSize.y,
-        cudaGridSize.z, /* grid dim */
-        cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
-        sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
-        &arr[0], /* arguments */
-        0));
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
+                                   cudaGridSize.z, /* grid dim */
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
+                                   cudaBlockSize.z, /* block dim */
+                                   sizeof(float) * 2 * NUM_THREADS,
+                                   0, /* shared mem, stream */
+                                   &arr[0], /* arguments */
+                                   0));

     checkCudaErrors(cuCtxSynchronize());
-    checkCudaErrors(
-        cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemFree(dinput));
     checkCudaErrors(cuMemFree(doutput));
     checkCudaErrors(cuMemFree(dtimer));
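The _nvrtc variant compiles its kernel at run time instead of build time. A minimal sketch of that compilation step (an assumption about roughly what nvrtc_helper.h wraps; the kernel source and names here are illustrative):

#include <cstdio>
#include <nvrtc.h>
#include <vector>

int main()
{
    const char *src = "extern \"C\" __global__ void k(int *p) { p[threadIdx.x] = threadIdx.x; }";
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "k.cu", 0, NULL, NULL); // source, name, no headers
    nvrtcCompileProgram(prog, 0, NULL);                    // no extra options
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());
    nvrtcDestroyProgram(&prog);
    // The PTX is then loaded with the driver API (cuModuleLoadDataEx,
    // cuModuleGetFunction) and launched via cuLaunchKernel, as above.
    printf("compiled %zu bytes of PTX\n", ptxSize);
    return 0;
}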
clock_nvrtc (kernel source)

@@ -37,15 +37,16 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output,
-                                          clock_t *timer) {
+extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];

@@ -66,9 +67,11 @@ extern "C" __global__ void timedReduction(const float *input, float *output,
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
cudaOpenMP.cu

@@ -37,20 +37,24 @@
 using namespace std;

 // a simple kernel that simply increments each array element by b
-__global__ void kernelAddConstant(int *g_a, const int b) {
+__global__ void kernelAddConstant(int *g_a, const int b)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     g_a[idx] += b;
 }

 // a predicate that checks whether each array element is set to its index plus b
-int correctResult(int *data, const int n, const int b) {
+int correctResult(int *data, const int n, const int b)
+{
     for (int i = 0; i < n; i++)
-        if (data[i] != i + b) return 0;
+        if (data[i] != i + b)
+            return 0;

     return 1;
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int num_gpus = 0; // number of CUDA GPUs

     printf("%s Starting...\n\n", argv[0]);

@@ -93,7 +97,8 @@ int main(int argc, char *argv[]) {
         return 1;
     }

-    for (unsigned int i = 0; i < n; i++) a[i] = i;
+    for (unsigned int i = 0; i < n; i++)
+        a[i] = i;

     ////////////////////////////////////////////////////////////////
     // run as many CPU threads as there are CUDA devices

@@ -105,8 +110,7 @@ int main(int argc, char *argv[]) {
     // Recall that all variables declared inside an "omp parallel" scope are
     // local to each CPU thread
     //
-    omp_set_num_threads(
-        num_gpus); // create as many CPU threads as there are CUDA devices
+    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
     // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
     // are CUDA devices
 #pragma omp parallel

@@ -116,31 +120,23 @@ int main(int argc, char *argv[]) {

         // set and check the CUDA device for this CPU thread
         int gpu_id = -1;
-        checkCudaErrors(cudaSetDevice(
-            cpu_thread_id %
-            num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
+        checkCudaErrors(
+            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
         checkCudaErrors(cudaGetDevice(&gpu_id));
-        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
-               num_cpu_threads, gpu_id);
+        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

-        int *d_a =
-            0; // pointer to memory on the device associated with this CPU thread
-        int *sub_a =
-            a +
-            cpu_thread_id * n /
-                num_cpu_threads; // pointer to this CPU thread's portion of data
+        int *d_a = 0; // pointer to memory on the device associated with this CPU thread
+        int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
         unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
         dim3 gpu_threads(128); // 128 threads per block
         dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

         checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
         checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
-        checkCudaErrors(
-            cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
         kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

-        checkCudaErrors(
-            cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
         checkCudaErrors(cudaFree(d_a));
     }
     printf("---------------------------\n");

@@ -153,7 +149,8 @@ int main(int argc, char *argv[]) {
     //
     bool bResult = correctResult(a, n, b);

-    if (a) free(a); // free CPU memory
+    if (a)
+        free(a); // free CPU memory

     exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
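The pattern being reformatted here is one OpenMP CPU thread per CUDA device. Stripped to its core, as a sketch (compile with nvcc -Xcompiler -fopenmp; per-device work elided):

#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>

int main()
{
    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1)
        return 1;
    omp_set_num_threads(num_gpus); // one CPU thread per device
#pragma omp parallel
    {
        int tid = omp_get_thread_num();
        cudaSetDevice(tid % num_gpus); // each thread binds its own GPU
        int dev = -1;
        cudaGetDevice(&dev);
        printf("CPU thread %d drives CUDA device %d\n", tid, dev);
        // ... per-device cudaMalloc / kernel launch / cudaMemcpy here ...
    }
    return 0;
}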
fp16ScalarProduct.cu

@@ -25,17 +25,18 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#include "cuda_fp16.h"
-#include "helper_cuda.h"
-
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>

+#include "cuda_fp16.h"
+#include "helper_cuda.h"
+
 #define NUM_OF_BLOCKS 128
 #define NUM_OF_THREADS 128

-__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
+__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
+{
     if (threadIdx.x < 64)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
     __syncthreads();

@@ -59,27 +60,34 @@ __forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
     __syncthreads();
 }

-__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
-    if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
+__forceinline__ __device__ void reduceInShared_native(half2 *const v)
+{
+    if (threadIdx.x < 64)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
     __syncthreads();
-    if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
+    if (threadIdx.x < 32)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
     __syncthreads();
-    if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
+    if (threadIdx.x < 16)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
     __syncthreads();
-    if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
+    if (threadIdx.x < 8)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
     __syncthreads();
-    if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
+    if (threadIdx.x < 4)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
     __syncthreads();
-    if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
+    if (threadIdx.x < 2)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
     __syncthreads();
-    if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
+    if (threadIdx.x < 1)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
     __syncthreads();
 }

-__global__ void scalarProductKernel_intrinsics(half2 const *const a,
-                                               half2 const *const b,
-                                               float *const results,
-                                               size_t const size) {
+__global__ void
+scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
     const int stride = gridDim.x * blockDim.x;
     __shared__ half2 shArray[NUM_OF_THREADS];

@@ -101,10 +109,9 @@ __global__ void scalarProductKernel_intrinsics(half2 const *const a,
     }
 }

-__global__ void scalarProductKernel_native(half2 const *const a,
-                                           half2 const *const b,
-                                           float *const results,
-                                           size_t const size) {
+__global__ void
+scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
     const int stride = gridDim.x * blockDim.x;
     __shared__ half2 shArray[NUM_OF_THREADS];

@@ -126,7 +133,8 @@ __global__ void scalarProductKernel_native(half2 const *const a,
     }
 }

-void generateInput(half2 *a, size_t size) {
+void generateInput(half2 *a, size_t size)
+{
     for (size_t i = 0; i < size; ++i) {
         half2 temp;
         temp.x = static_cast<float>(rand() % 4);

@@ -135,7 +143,8 @@ void generateInput(half2 *a, size_t size) {
     }
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     srand((unsigned int)time(NULL));
     size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

@@ -151,8 +160,7 @@ int main(int argc, char *argv[]) {
     checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

     if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
-        printf(
-            "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
-            "higher.\n");
+        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
+               "higher.\n");
         return EXIT_WAIVED;
     }

@@ -162,23 +170,17 @@ int main(int argc, char *argv[]) {
         checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
     }

-    checkCudaErrors(
-        cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
-    checkCudaErrors(
-        cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
+    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
+    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

     for (int i = 0; i < 2; ++i) {
         generateInput(vec[i], size);
-        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
-                                   cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
     }

-    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-        devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults,
-                               NUM_OF_BLOCKS * sizeof *results,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

     float result_native = 0;
     for (int i = 0; i < NUM_OF_BLOCKS; ++i) {

@@ -186,12 +188,9 @@ int main(int argc, char *argv[]) {
     }
     printf("Result native operators\t: %f \n", result_native);

-    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-        devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults,
-                               NUM_OF_BLOCKS * sizeof *results,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

     float result_intrinsics = 0;
     for (int i = 0; i < NUM_OF_BLOCKS; ++i) {

@@ -199,9 +198,7 @@ int main(int argc, char *argv[]) {
     }
     printf("Result intrinsics\t: %f \n", result_intrinsics);

-    printf("&&&& fp16ScalarProduct %s\n",
-           (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
-                                                               : "FAILED");
+    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");

     for (int i = 0; i < 2; ++i) {
         checkCudaErrors(cudaFree(devVec[i]));
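The two kernels in this file differ only in whether they use half2 operators or explicit intrinsics. A hypothetical helper showing the lane-wise fp16 arithmetic involved (not from the sample; requires sm_53 or newer, matching the compute-capability check above):

#include <cuda_fp16.h>

// Lane-wise fp16 dot contribution of one half2 pair, accumulated in fp32.
__device__ float dot2(half2 x, half2 y)
{
    half2 p = __hmul2(x, y);                 // (x.lo*y.lo, x.hi*y.hi) in one instruction
    return __low2float(p) + __high2float(p); // reduce the two lanes in fp32
}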
matrixMul.cu

@@ -40,24 +40,23 @@
 */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>

 // Helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 /**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
-template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
-                                                        float *B, int wA,
-                                                        int wB) {
+template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
     // Block index
     int bx = blockIdx.x;
     int by = blockIdx.y;

@@ -87,9 +86,7 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,

     // Loop over all the sub-matrices of A and B
     // required to compute the block sub-matrix
-    for (int a = aBegin, b = bBegin;
-         a <= aEnd;
-         a += aStep, b += bStep) {
+    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
         // Declaration of the shared memory array As used to
         // store the sub-matrix of A
         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

@@ -128,7 +125,8 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
     C[c + wB * ty + tx] = Csub;
 }

-void ConstantInit(float *data, int size, float val) {
+void ConstantInit(float *data, int size, float val)
+{
     for (int i = 0; i < size; ++i) {
         data[i] = val;
     }

@@ -137,9 +135,8 @@ void ConstantInit(float *data, int size, float val) {
 /**
 * Run a simple test of matrix multiplication using CUDA
 */
-int MatrixMultiply(int argc, char **argv,
-                   int block_size, const dim3 &dimsA,
-                   const dim3 &dimsB) {
+int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
+{
     // Allocate host memory for matrices A and B
     unsigned int size_A = dimsA.x * dimsA.y;
     unsigned int mem_size_A = sizeof(float) * size_A;

@@ -181,10 +178,8 @@ int MatrixMultiply(int argc, char **argv,
     checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

     // copy host memory to device
-    checkCudaErrors(
-        cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(
-        cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

     // Setup execution parameters
     dim3 threads(block_size, block_size);

@@ -195,11 +190,10 @@ int MatrixMultiply(int argc, char **argv,

     // Performs warmup operation using matrixMul CUDA kernel
     if (block_size == 16) {
-        MatrixMulCUDA<16>
-            <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-    } else {
-        MatrixMulCUDA<32>
-            <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+    }
+    else {
+        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     }

     printf("done\n");

@@ -213,11 +207,10 @@ int MatrixMultiply(int argc, char **argv,

     for (int j = 0; j < nIter; j++) {
         if (block_size == 16) {
-            MatrixMulCUDA<16>
-                <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-        } else {
-            MatrixMulCUDA<32>
-                <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        }
+        else {
+            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
         }
     }

@@ -232,19 +225,18 @@ int MatrixMultiply(int argc, char **argv,

     // Compute and print the performance
     float msecPerMatrixMul = msecTotal / nIter;
-    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
-                               static_cast<double>(dimsA.y) *
-                               static_cast<double>(dimsB.x);
-    double gigaFlops =
-        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
-    printf(
-        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
-        " WorkgroupSize= %u threads/block\n",
-        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
+    double flopsPerMatrixMul =
+        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
+    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
+           " WorkgroupSize= %u threads/block\n",
+           gigaFlops,
+           msecPerMatrixMul,
+           flopsPerMatrixMul,
+           threads.x * threads.y);

     // Copy result from device to host
-    checkCudaErrors(
-        cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaStreamSynchronize(stream));

     printf("Checking computed result for correctness: ");

@@ -261,8 +253,7 @@ int MatrixMultiply(int argc, char **argv,
         double rel_err = abs_err / abs_val / dot_length;

         if (rel_err > eps) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
-                   i, h_C[i], dimsA.x * valB, eps);
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
             correct = false;
         }
     }

@@ -278,13 +269,13 @@ int MatrixMultiply(int argc, char **argv,
     checkCudaErrors(cudaFree(d_C));
     checkCudaErrors(cudaEventDestroy(start));
     checkCudaErrors(cudaEventDestroy(stop));
-    printf(
-        "\nNOTE: The CUDA Samples are not meant for performance "
-        "measurements. Results may vary when GPU Boost is enabled.\n");
+    printf("\nNOTE: The CUDA Samples are not meant for performance "
+           "measurements. Results may vary when GPU Boost is enabled.\n");

     if (correct) {
         return EXIT_SUCCESS;
-    } else {
+    }
+    else {
         return EXIT_FAILURE;
     }
 }

@@ -293,15 +284,15 @@ int MatrixMultiply(int argc, char **argv,
 /**
 * Program main
 */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[Matrix Multiply Using CUDA] - Starting...\n");

-    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-        checkCmdLineFlag(argc, (const char **)argv, "?")) {
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
         printf("Usage -device=n (n >= 0 for deviceID)\n");
         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
         printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf("  Note: Outer matrix dimensions of A & B matrices" \
-               " must be equal.\n");
+        printf("  Note: Outer matrix dimensions of A & B matrices"
+               " must be equal.\n");

         exit(EXIT_SUCCESS);

@@ -337,13 +328,11 @@ int main(int argc, char **argv) {
     }

     if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-               dimsA.x, dimsB.y);
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
         exit(EXIT_FAILURE);
     }

-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
-           dimsB.x, dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

     checkCudaErrors(cudaProfilerStart());
     int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
|||||||
@ -46,23 +46,23 @@
|
|||||||
|
|
||||||
// includes, system
|
// includes, system
|
||||||
#include <builtin_types.h>
|
#include <builtin_types.h>
|
||||||
#include <math.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <iostream>
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
// includes, project, CUDA
|
// includes, project, CUDA
|
||||||
|
#include <cstring>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <helper_cuda_drvapi.h>
|
#include <helper_cuda_drvapi.h>
|
||||||
#include <helper_image.h>
|
#include <helper_image.h>
|
||||||
#include <helper_string.h>
|
#include <helper_string.h>
|
||||||
#include <helper_timer.h>
|
#include <helper_timer.h>
|
||||||
|
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "matrixMul.h"
|
#include "matrixMul.h"
|
||||||
|
|
||||||
|
|
||||||
@ -71,11 +71,9 @@
|
|||||||
void runTest(int argc, char **argv);
|
void runTest(int argc, char **argv);
|
||||||
void randomInit(float *, int);
|
void randomInit(float *, int);
|
||||||
|
|
||||||
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
|
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
|
||||||
unsigned int, unsigned int);
|
|
||||||
|
|
||||||
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);
|
||||||
int *blk_size);
|
|
||||||
|
|
||||||
#ifndef FATBIN_FILE
|
#ifndef FATBIN_FILE
|
||||||
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
|
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
|
||||||
@ -91,7 +89,8 @@ size_t totalGlobalMem;
|
|||||||
|
|
||||||
const char *sSDKsample = "matrixMulDrv (Driver API)";
|
const char *sSDKsample = "matrixMulDrv (Driver API)";
|
||||||
|
|
||||||
void constantInit(float *data, int size, float val) {
|
void constantInit(float *data, int size, float val)
|
||||||
|
{
|
||||||
for (int i = 0; i < size; ++i) {
|
for (int i = 0; i < size; ++i) {
|
||||||
data[i] = val;
|
data[i] = val;
|
||||||
}
|
}
|
||||||
@ -100,7 +99,8 @@ void constantInit(float *data, int size, float val) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Program main
|
// Program main
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
printf("[ %s ]\n", sSDKsample);
|
printf("[ %s ]\n", sSDKsample);
|
||||||
|
|
||||||
runTest(argc, argv);
|
runTest(argc, argv);
|
||||||
@ -109,7 +109,8 @@ int main(int argc, char **argv) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//! Run a simple test for CUDA
|
//! Run a simple test for CUDA
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
void runTest(int argc, char **argv) {
|
void runTest(int argc, char **argv)
|
||||||
|
{
|
||||||
// initialize CUDA
|
// initialize CUDA
|
||||||
CUfunction matrixMul = NULL;
|
CUfunction matrixMul = NULL;
|
||||||
int block_size = 0;
|
int block_size = 0;
|
||||||
@ -172,10 +173,19 @@ void runTest(int argc, char **argv) {
|
|||||||
size_t Matrix_Width_B = (size_t)WB;
|
size_t Matrix_Width_B = (size_t)WB;
|
||||||
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||||
// new CUDA 4.0 Driver API Kernel launch call
|
// new CUDA 4.0 Driver API Kernel launch call
|
||||||
checkCudaErrors(cuLaunchKernel(
|
checkCudaErrors(cuLaunchKernel(matrixMul,
|
||||||
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
|
grid.x,
|
||||||
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
|
grid.y,
|
||||||
} else {
|
grid.z,
|
||||||
|
block.x,
|
||||||
|
block.y,
|
||||||
|
block.z,
|
||||||
|
2 * block_size * block_size * sizeof(float),
|
||||||
|
NULL,
|
||||||
|
args,
|
||||||
|
NULL));
|
||||||
|
}
|
||||||
|
else {
|
||||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||||
// Launching (advanced method)
|
// Launching (advanced method)
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
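For context on the call reflowed above, here is a minimal sketch of the simple kernelParams launch path. Variable names such as func and module are illustrative assumptions, not part of this diff:

    // Sketch: Driver API launch via the kernelParams array (method 1).
    // Assumes a current context and that the fatbin was loaded into `module`.
    CUfunction func;
    checkCudaErrors(cuModuleGetFunction(&func, module, "matrixMul_bs16_64bit"));

    size_t widthA = WA, widthB = WB;
    void *args[5] = {&d_C, &d_A, &d_B, &widthA, &widthB}; // a pointer to each argument

    checkCudaErrors(cuLaunchKernel(func,
                                   grid.x, grid.y, grid.z,    // grid dimensions
                                   block.x, block.y, block.z, // block dimensions
                                   2 * block_size * block_size * sizeof(float), // dynamic shared memory
                                   NULL,    // default stream
                                   args,    // kernelParams
                                   NULL));  // "extra" is unused on this path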
@@ -198,14 +208,20 @@ void runTest(int argc, char **argv) {
         *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
         offset += sizeof(Matrix_Width_B);

-        void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
-                                         CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
-                                         CU_LAUNCH_PARAM_END};
+        void *kernel_launch_config[5] = {
+            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

         // new CUDA 4.0 Driver API Kernel launch call
-        checkCudaErrors(cuLaunchKernel(
-            matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
-            2 * block_size * block_size * sizeof(float), NULL, NULL,
+        checkCudaErrors(cuLaunchKernel(matrixMul,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z,
+                                       block.x,
+                                       block.y,
+                                       block.z,
+                                       2 * block_size * block_size * sizeof(float),
+                                       NULL,
+                                       NULL,
                                        reinterpret_cast<void **>(&kernel_launch_config)));
     }

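The alternative path in this hunk packs the same arguments into one byte buffer and hands it to cuLaunchKernel through the "extra" parameter. A sketch of the packing step, assuming the ALIGN_UP helper used elsewhere in the samples:

    // Sketch: CU_LAUNCH_PARAM_* launch (method 2). Each argument is copied
    // into argBuffer at its natural alignment; offset ends up as the total size.
    #define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    char argBuffer[256];
    size_t offset = 0;

    ALIGN_UP(offset, __alignof(d_C));
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
    offset += sizeof(d_C);
    // ... repeat for d_A, d_B, Matrix_Width_A, Matrix_Width_B ...

    void *kernel_launch_config[5] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
    checkCudaErrors(cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                                   2 * block_size * block_size * sizeof(float),
                                   NULL, NULL, reinterpret_cast<void **>(&kernel_launch_config)));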
@@ -222,8 +238,7 @@ void runTest(int argc, char **argv) {

     for (int i = 0; i < static_cast<int>(WC * HC); i++) {
         if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
-                   h_C[i], WA * valB);
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
             correct = false;
         }
     }
@@ -244,14 +259,15 @@ void runTest(int argc, char **argv) {
 }

 // Allocates a matrix with random float entries.
-void randomInit(float *data, int size) {
+void randomInit(float *data, int size)
+{
     for (int i = 0; i < size; ++i) {
         data[i] = rand() / static_cast<float>(RAND_MAX);
     }
 }

-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size) {
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
+{
     CUfunction cuFunction = 0;
     int major = 0, minor = 0;
     char deviceName[100];
@@ -259,16 +275,13 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
     cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

     // get compute capabilities and the devicename
-    checkCudaErrors(cuDeviceGetAttribute(
-        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(
-        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
     checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
     printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

     checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
-    printf("  Total amount of global memory:     %llu bytes\n",
-           (long long unsigned int)totalGlobalMem);
+    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);

     checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

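This hunk sits inside the standard Driver API bring-up. A condensed sketch of the sequence, spelling out the cuInit/cuDeviceGet steps that findCudaDeviceDRV wraps (device 0 is chosen here as an assumption):

    CUdevice cuDevice;
    CUcontext cuContext;
    int major = 0, minor = 0;
    size_t totalGlobalMem = 0;

    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&cuDevice, 0)); // findCudaDeviceDRV wraps device selection
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); // context is current after creation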
@@ -278,7 +291,8 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,

     if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
         exit(EXIT_FAILURE);
-    } else {
+    }
+    else {
         printf("> initCUDA loading module: <%s>\n", module_path.c_str());
     }

@@ -291,8 +305,7 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
     checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

     // select the suitable kernel function
-    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
-                             "matrixMul_bs8_64bit"};
+    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

     int idx = 0;
     int block_size = 32;
@@ -302,12 +315,12 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,

         checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
         checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
-            &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
-            2 * block_size * block_size * sizeof(float), 0));
+            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
         if (block_size * block_size <= threadsPerBlock) {
             printf("> %d block size selected\n", block_size);
             break;
-        } else {
+        }
+        else {
             block_size /= 2;
         }
         idx++;
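The selection loop above is worth spelling out: cuOccupancyMaxPotentialBlockSize proposes a thread count per block, and the sample halves its candidate tile (32, 16, 8) until block_size * block_size threads fit. A sketch of one iteration:

    // Sketch: occupancy-driven choice of the matrixMul tile size.
    int blocksPerGrid = 0, threadsPerBlock = 0;
    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(&blocksPerGrid,
                                                     &threadsPerBlock,
                                                     cuFunction,
                                                     0, // no per-block-size shared-memory callback
                                                     2 * block_size * block_size * sizeof(float),
                                                     0)); // no upper block-size limit
    if (block_size * block_size <= threadsPerBlock) {
        printf("> %d block size selected\n", block_size);
    }
    else {
        block_size /= 2; // retry with the next smaller kernel variant
    }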
@@ -42,8 +42,8 @@
 //! wA is A's width and wB is B's width
 ////////////////////////////////////////////////////////////////////////////////
 template <int block_size, typename size_type>
-__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
-                          size_type wB) {
+__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
+{
     // Block index
     size_type bx = blockIdx.x;
     size_type by = blockIdx.y;
@@ -96,7 +96,8 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
     // of the block sub-matrix
 #pragma unroll

-    for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
+    for (size_type k = 0; k < block_size; ++k)
+        Csub += AS(ty, k) * BS(k, tx);

     // Synchronize to make sure that the preceding
     // computation is done before loading two new
@@ -111,16 +112,16 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
 }

 // C wrappers around our template kernel
-extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
-                                               size_t wA, size_t wB) {
+extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
+{
     matrixMul<8, size_t>(C, A, B, wA, wB);
 }
-extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
-                                                size_t wA, size_t wB) {
+extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
+{
     matrixMul<16, size_t>(C, A, B, wA, wB);
 }
-extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
-                                                size_t wA, size_t wB) {
+extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
+{
     matrixMul<32, size_t>(C, A, B, wA, wB);
 }

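A note on the wrappers reformatted above: cuModuleGetFunction resolves an unmangled symbol name, so the templated matrixMul must be exposed through fixed extern "C" instantiations, one per block size. Host-side lookup then reads, for example:

    // Sketch: resolving one wrapper from the loaded module by its C name.
    CUfunction func;
    checkCudaErrors(cuModuleGetFunction(&func, cuModule, "matrixMul_bs16_64bit"));
    // matrixMul<16, size_t> itself is not directly addressable: its symbol
    // name would be C++-mangled and compiler dependent.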
@@ -20,9 +20,10 @@
 // #define CUDA_INIT_D3D11
 // #define CUDA_INIT_OPENGL

-#include <stdio.h>
 #include "cuda_drvapi_dynlink.h"

+#include <stdio.h>

 tcuInit *_cuInit;
 tcuDriverGetVersion *cuDriverGetVersion;
 tcuDeviceGet *cuDeviceGet;
@@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
     *pInstance = LoadLibrary(__CudaLibName);

-    if (*pInstance == NULL)
-    {
+    if (*pInstance == NULL) {
         printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
         return CUDA_ERROR_UNKNOWN;
     }
@@ -251,24 +251,21 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 #define GET_PROC_EX(name, alias, required)                                               \
     alias = (t##name *)GetProcAddress(CudaDrvLib, #name);                                \
     if (alias == NULL && required) {                                                     \
-        printf("Failed to find required function \"%s\" in %s\n",                        \
-               #name, __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                       \
     }

 #define GET_PROC_EX_V2(name, alias, required)                                                           \
     alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));                                \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v2), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

 #define GET_PROC_EX_V3(name, alias, required)                                                           \
     alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));                                \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v3), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

@@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
     *pInstance = dlopen(__CudaLibName, RTLD_NOW);

-    if (*pInstance == NULL)
-    {
+    if (*pInstance == NULL) {
         printf("dlopen \"%s\" failed!\n", __CudaLibName);
         return CUDA_ERROR_UNKNOWN;
     }
@@ -306,24 +302,21 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 #define GET_PROC_EX(name, alias, required)                                               \
     alias = (t##name *)dlsym(CudaDrvLib, #name);                                         \
     if (alias == NULL && required) {                                                     \
-        printf("Failed to find required function \"%s\" in %s\n",                        \
-               #name, __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                       \
     }

 #define GET_PROC_EX_V2(name, alias, required)                                                           \
     alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));                                         \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v2), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

 #define GET_PROC_EX_V3(name, alias, required)                                                           \
     alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));                                         \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v3), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

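The macros above all expand to the same resolve-and-check pattern. A minimal sketch of what one GET_PROC_EX_V2 expansion amounts to on the dlopen side (libcuda.so and cuCtxCreate are chosen as the example):

    // Sketch: runtime resolution of one Driver API entry point.
    typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
    tcuCtxCreate *cuCtxCreate = NULL;

    void *CudaDrvLib = dlopen("libcuda.so", RTLD_NOW);
    if (CudaDrvLib == NULL) {
        return CUDA_ERROR_UNKNOWN;
    }

    // "_v2" suffix: the CUDA 3.2+ ABI with 64-bit-clean parameters.
    cuCtxCreate = (tcuCtxCreate *)dlsym(CudaDrvLib, "cuCtxCreate_v2");
    if (cuCtxCreate == NULL) {
        printf("Failed to find required function \"cuCtxCreate_v2\" in libcuda.so\n");
        return CUDA_ERROR_UNKNOWN;
    }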
@@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     // available since 2.2. if not present, version 1.0 is assumed
     GET_PROC_OPTIONAL(cuDriverGetVersion);

-    if (cuDriverGetVersion)
-    {
+    if (cuDriverGetVersion) {
         CHECKED_CALL(cuDriverGetVersion(&driverVer));
     }

@@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     GET_PROC(cuStreamDestroy);

     // These are CUDA 5.0 new functions
-    if (driverVer >= 5000)
-    {
+    if (driverVer >= 5000) {
         GET_PROC(cuMipmappedArrayCreate);
         GET_PROC(cuMipmappedArrayDestroy);
         GET_PROC(cuMipmappedArrayGetLevel);
     }

     // These are CUDA 4.2 new functions
-    if (driverVer >= 4020)
-    {
+    if (driverVer >= 4020) {
         GET_PROC(cuFuncSetSharedMemConfig);
         GET_PROC(cuCtxGetSharedMemConfig);
         GET_PROC(cuCtxSetSharedMemConfig);
     }

     // These are CUDA 4.1 new functions
-    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
-    {
+    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
         GET_PROC(cuDeviceGetByPCIBusId);
         GET_PROC(cuDeviceGetPCIBusId);
         GET_PROC(cuIpcGetEventHandle);
@@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     }

     // These could be _v2 interfaces
-    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
-    {
+    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
         GET_PROC_V2(cuCtxDestroy);
         GET_PROC_V2(cuCtxPopCurrent);
         GET_PROC_V2(cuCtxPushCurrent);
@@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC_V2(cuEventDestroy);
     }

-    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
-    {
+    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
         GET_PROC_V2(cuDeviceTotalMem);
         GET_PROC_V2(cuCtxCreate);
         GET_PROC_V2(cuModuleGetGlobal);
@@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC_V2(cuTexRefSetAddress);
         GET_PROC_V2(cuTexRefGetAddress);

-        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
-        {
+        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
             GET_PROC_V3(cuTexRefSetAddress2D);
         }
-        else
-        {
+        else {
             GET_PROC_V2(cuTexRefSetAddress2D);
         }
     }
-    else
-    {
+    else {
         // versions earlier than 3020
         GET_PROC(cuDeviceTotalMem);
         GET_PROC(cuCtxCreate);
@@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     }

     // The following functions are specific to CUDA versions
-    if (driverVer >= 4000)
-    {
+    if (driverVer >= 4000) {
         GET_PROC(cuCtxSetCurrent);
         GET_PROC(cuCtxGetCurrent);
         GET_PROC(cuMemHostRegister);
@@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuProfilerStop);
     }

-    if (driverVer >= 3010)
-    {
+    if (driverVer >= 3010) {
         GET_PROC(cuModuleGetSurfRef);
         GET_PROC(cuSurfRefSetArray);
         GET_PROC(cuSurfRefGetArray);
@@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuCtxGetLimit);
     }

-    if (driverVer >= 3000)
-    {
+    if (driverVer >= 3000) {
         GET_PROC(cuMemcpyDtoDAsync);
         GET_PROC(cuFuncSetCacheConfig);
 #ifdef CUDA_INIT_D3D11
@@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuGraphicsUnregisterResource);
         GET_PROC(cuGraphicsSubResourceGetMappedArray);

-        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
-        {
+        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
             GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
         }
-        else
-        {
+        else {
             GET_PROC(cuGraphicsResourceGetMappedPointer);
         }

@@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuGetExportTable);
     }

-    if (driverVer >= 2030)
-    {
+    if (driverVer >= 2030) {
         GET_PROC(cuMemHostGetFlags);
 #ifdef CUDA_INIT_D3D10
         GET_PROC(cuD3D10GetDevice);
@@ -624,8 +602,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
 #endif
     }

-    if (driverVer >= 2010)
-    {
+    if (driverVer >= 2010) {
         GET_PROC(cuModuleLoadDataEx);
         GET_PROC(cuModuleLoadFatBinary);
 #ifdef CUDA_INIT_OPENGL
@@ -43,7 +43,8 @@
 #define CUDA_VERSION 3020 /* 3.2 */

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 /**
@@ -81,8 +82,7 @@ typedef struct CUuuid_st /**< CUDA definition o
 /**
  * Context creation flags
  */
-typedef enum CUctx_flags_enum
-{
+typedef enum CUctx_flags_enum {
     CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
     CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
     CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
@@ -103,8 +103,7 @@ typedef enum CUctx_flags_enum
 /**
  * Event creation flags
  */
-typedef enum CUevent_flags_enum
-{
+typedef enum CUevent_flags_enum {
     CU_EVENT_DEFAULT = 0, /**< Default event flag */
     CU_EVENT_BLOCKING_SYNC = 1, /**< Event uses blocking synchronization */
     CU_EVENT_DISABLE_TIMING = 2 /**< Event will not record timing data */
@@ -113,8 +112,7 @@ typedef enum CUevent_flags_enum
 /**
  * Array formats
  */
-typedef enum CUarray_format_enum
-{
+typedef enum CUarray_format_enum {
     CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
     CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
     CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
@@ -128,8 +126,7 @@ typedef enum CUarray_format_enum
 /**
  * Texture reference addressing modes
  */
-typedef enum CUaddress_mode_enum
-{
+typedef enum CUaddress_mode_enum {
     CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
     CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */
     CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
@@ -139,8 +136,7 @@ typedef enum CUaddress_mode_enum
 /**
  * Texture reference filtering modes
  */
-typedef enum CUfilter_mode_enum
-{
+typedef enum CUfilter_mode_enum {
     CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
     CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
 } CUfilter_mode;
@@ -148,8 +144,7 @@ typedef enum CUfilter_mode_enum
 /**
  * Device properties
 */
-typedef enum CUdevice_attribute_enum
-{
+typedef enum CUdevice_attribute_enum {
     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
@@ -158,12 +153,15 @@ typedef enum CUdevice_attribute_enum
     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
     CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
-    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
-    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK =
+        8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY =
+        9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
     CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
     CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
     CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
-    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK =
+        12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
     CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */
     CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
     CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
@@ -190,7 +188,8 @@ typedef enum CUdevice_attribute_enum
     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */
     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 /**< Minor compute capability version number */
 #if __CUDA_API_VERSION >= 4000
-    , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
+    ,
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
     CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
     CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
@@ -221,8 +220,7 @@ typedef struct CUdevprop_st
 /**
  * Function properties
  */
-typedef enum CUfunction_attribute_enum
-{
+typedef enum CUfunction_attribute_enum {
     /**
      * The maximum number of threads per block, beyond which a launch of the
     * function would fail. This number depends on both the function and the
@@ -277,8 +275,7 @@ typedef enum CUfunction_attribute_enum
 /**
  * Function cache configurations
  */
-typedef enum CUfunc_cache_enum
-{
+typedef enum CUfunc_cache_enum {
     CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */
     CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
     CU_FUNC_CACHE_PREFER_L1 = 0x02 /**< prefer larger L1 cache and smaller shared memory */
@@ -287,8 +284,7 @@ typedef enum CUfunc_cache_enum
 /**
  * Shared memory configurations
  */
-typedef enum CUsharedconfig_enum
-{
+typedef enum CUsharedconfig_enum {
     CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */
     CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */
     CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */
@@ -297,33 +293,34 @@ typedef enum CUsharedconfig_enum
 /**
  * Memory types
  */
-typedef enum CUmemorytype_enum
-{
+typedef enum CUmemorytype_enum {
     CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
     CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
     CU_MEMORYTYPE_ARRAY = 0x03 /**< Array memory */
 #if __CUDA_API_VERSION >= 4000
-    , CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
+    ,
+    CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
 #endif
 } CUmemorytype;

 /**
  * Compute Modes
  */
-typedef enum CUcomputemode_enum
-{
+typedef enum CUcomputemode_enum {
     CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */
-    CU_COMPUTEMODE_PROHIBITED = 2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
+    CU_COMPUTEMODE_PROHIBITED =
+        2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
 #if __CUDA_API_VERSION >= 4000
-    , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
+    ,
+    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single
+                                            process can be present on this device at a time) */
 #endif
 } CUcomputemode;

 /**
  * Online compiler options
  */
-typedef enum CUjit_option_enum
-{
+typedef enum CUjit_option_enum {
     /**
      * Max number of registers that a thread may use.\n
      * Option type: unsigned int
@@ -414,8 +411,7 @@ typedef enum CUjit_option_enum
 /**
  * Online compilation targets
  */
-typedef enum CUjit_target_enum
-{
+typedef enum CUjit_target_enum {
     CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */
     CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */
     CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
@@ -434,8 +430,7 @@ typedef enum CUjit_target_enum
 /**
  * Cubin matching fallback strategies
  */
-typedef enum CUjit_fallback_enum
-{
+typedef enum CUjit_fallback_enum {
     CU_PREFER_PTX = 0, /**< Prefer to compile ptx */
     CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */
 } CUjit_fallback;
@@ -443,8 +438,7 @@ typedef enum CUjit_fallback_enum
 /**
  * Flags to register a graphics resource
  */
-typedef enum CUgraphicsRegisterFlags_enum
-{
+typedef enum CUgraphicsRegisterFlags_enum {
     CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
     CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
     CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
@@ -454,8 +448,7 @@ typedef enum CUgraphicsRegisterFlags_enum
 /**
  * Flags for mapping and unmapping interop resources
  */
-typedef enum CUgraphicsMapResourceFlags_enum
-{
+typedef enum CUgraphicsMapResourceFlags_enum {
     CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
     CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
     CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
@@ -464,8 +457,7 @@ typedef enum CUgraphicsMapResourceFlags_enum
 /**
  * Array indices for cube faces
  */
-typedef enum CUarray_cubemap_face_enum
-{
+typedef enum CUarray_cubemap_face_enum {
     CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
     CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
     CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
@@ -477,8 +469,7 @@ typedef enum CUarray_cubemap_face_enum
 /**
  * Limits
  */
-typedef enum CUlimit_enum
-{
+typedef enum CUlimit_enum {
     CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */
     CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
     CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */
@@ -487,8 +478,7 @@ typedef enum CUlimit_enum
 /**
  * Resource types
 */
-typedef enum CUresourcetype_enum
-{
+typedef enum CUresourcetype_enum {
     CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */
     CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
     CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
@@ -498,8 +488,7 @@ typedef enum CUresourcetype_enum
 /**
  * Error codes
  */
-typedef enum cudaError_enum
-{
+typedef enum cudaError_enum {
     /**
      * The API call returned with no errors. In the case of query calls, this
      * can also mean that the operation being queried is complete (see
@@ -1064,8 +1053,7 @@ typedef struct CUDA_TEXTURE_DESC_st
 /**
  * Resource view format
  */
-typedef enum CUresourceViewFormat_enum
-{
+typedef enum CUresourceViewFormat_enum {
     CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
     CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
     CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
@@ -1130,7 +1118,6 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
 #endif


-
 /**
  * If set, the CUDA array is a collection of layers, where each layer is either a 1D
  * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
@@ -1420,7 +1407,11 @@ typedef CUresult CUDAAPI tcuCtxSynchronize(void);

 typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
 typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
-typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module,
+                                             const void *image,
+                                             unsigned int numOptions,
+                                             CUjit_option *options,
+                                             void **optionValues);
 typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
 typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
 typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
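The newly wrapped tcuModuleLoadDataEx carries JIT options as parallel arrays. A short usage sketch through a resolved pointer, with an illustrative option choice (ptxImage is a hypothetical in-memory PTX string):

    // Sketch: loading a PTX image with one JIT option. Option values are
    // passed by value, cast through void *.
    CUmodule module;
    CUjit_option options[1] = {CU_JIT_MAX_REGISTERS};
    void *optionValues[1] = {(void *)(uintptr_t)32}; // cap threads at 32 registers
    CHECKED_CALL(cuModuleLoadDataEx(&module, ptxImage, 1, options, optionValues));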
@@ -1449,8 +1440,7 @@ typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                           size_t Height,
                                           // size of biggest r/w to be performed by kernels on this memory
                                           // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes
-);
+                                          unsigned int ElementSizeBytes);
 #else
 typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
 typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
@@ -1461,8 +1451,7 @@ typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                           unsigned int Height,
                                           // size of biggest r/w to be performed by kernels on this memory
                                           // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes
-);
+                                          unsigned int ElementSizeBytes);
 #endif

 typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
@@ -1495,9 +1484,9 @@ typedef struct CUipcMemHandle_st
     char reserved[CU_IPC_HANDLE_SIZE];
 } CUipcMemHandle;

-typedef enum CUipcMem_flags_enum
-{
-    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
+typedef enum CUipcMem_flags_enum {
+    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS =
+        0x1 /**< Automatically enable peer access between remote devices as needed */
 } CUipcMem_flags;

 typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
@@ -1510,9 +1499,14 @@ typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
 #endif

 typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
+typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
+;
 typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice,
+                                       CUcontext dstContext,
+                                       CUdeviceptr srcDevice,
+                                       CUcontext srcContext,
+                                       size_t ByteCount);

 /************************************
 **
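tcuMemcpyPeer, added with the CUDA 4.0 interfaces, copies between allocations that live in different contexts, possibly on different devices. A hedged usage sketch (all names illustrative):

    // Sketch: copy `bytes` from an allocation owned by srcCtx to one owned by
    // dstCtx. The driver uses a direct peer-to-peer path when available and
    // otherwise stages the transfer through host memory.
    CUresult rc = cuMemcpyPeer(dstPtr, dstCtx, srcPtr, srcCtx, bytes);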
@@ -1541,7 +1535,8 @@ typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, cons
 typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);

 // array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult CUDAAPI
+tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
 #else
 // system <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
@@ -1551,15 +1546,28 @@ typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, un
 typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

 // device <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray,
+                                       unsigned int dstOffset,
+                                       CUdeviceptr srcDevice,
+                                       unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice,
+                                       CUarray srcArray,
+                                       unsigned int srcOffset,
+                                       unsigned int ByteCount);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray,
+                                       unsigned int dstOffset,
+                                       const void *srcHost,
+                                       unsigned int ByteCount);
 typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

 // array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray,
+                                       unsigned int dstOffset,
+                                       CUarray srcArray,
+                                       unsigned int srcOffset,
+                                       unsigned int ByteCount);
 #endif

 // 2D memcpy
@@ -1586,36 +1594,51 @@ typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
 #if __CUDA_API_VERSION >= 3020
 // system <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void *srcHost, size_t ByteCount, CUstream hStream);
+                                            const void *srcHost,
+                                            size_t ByteCount,
+                                            CUstream hStream);
 typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            size_t ByteCount,
+                                            CUstream hStream);

 // device <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            size_t ByteCount,
+                                            CUstream hStream);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                            const void *srcHost, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
-                                            size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI
+tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI
+tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);

 #else
 // system <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void *srcHost, unsigned int ByteCount, CUstream hStream);
+                                            const void *srcHost,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);
 typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);

 // device <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
-                                            const void *srcHost, unsigned int ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
-                                            unsigned int ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray,
+                                            unsigned int dstOffset,
+                                            const void *srcHost,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);
+typedef CUresult CUDAAPI
+tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
 #endif

 // 2D memcpy
@@ -1634,13 +1657,22 @@ typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
 typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);

 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
 #else
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice,
+                                        unsigned int dstPitch,
+                                        unsigned short us,
+                                        unsigned int Width,
+                                        unsigned int Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
 #endif

 /************************************
@@ -1657,10 +1689,16 @@ typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache co
 typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);

 typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
-                                         unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
-                                         unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                                         unsigned int gridDimX,
+                                         unsigned int gridDimY,
+                                         unsigned int gridDimZ,
+                                         unsigned int blockDimX,
+                                         unsigned int blockDimY,
+                                         unsigned int blockDimZ,
                                          unsigned int sharedMemBytes,
-                                         CUstream hStream, void **kernelParams, void **extra);
+                                         CUstream hStream,
+                                         void **kernelParams,
+                                         void **extra);

 /************************************
 **
@ -1676,8 +1714,12 @@ typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_
|
|||||||
typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
|
typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
|
||||||
|
|
||||||
#if __CUDA_API_VERSION >= 5000
|
#if __CUDA_API_VERSION >= 5000
|
||||||
typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
|
typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle,
|
||||||
typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
|
const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
|
||||||
|
unsigned int numMipmapLevels);
|
||||||
|
typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray,
|
||||||
|
CUmipmappedArray hMipmappedArray,
|
||||||
|
unsigned int level);
|
||||||
typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
|
typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1694,10 +1736,19 @@ typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, un
|
|||||||
|
|
||||||
#if __CUDA_API_VERSION >= 3020
|
#if __CUDA_API_VERSION >= 3020
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
|
typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
|
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef,
|
||||||
|
const CUDA_ARRAY_DESCRIPTOR *desc,
|
||||||
|
CUdeviceptr dptr,
|
||||||
|
size_t Pitch);
|
||||||
#else
|
#else
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
|
typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset,
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
|
CUtexref hTexRef,
|
||||||
|
CUdeviceptr dptr,
|
||||||
|
unsigned int bytes);
|
||||||
|
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef,
|
||||||
|
const CUDA_ARRAY_DESCRIPTOR *desc,
|
||||||
|
CUdeviceptr dptr,
|
||||||
|
unsigned int Pitch);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
|
typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
|
||||||
@ -1763,7 +1814,10 @@ typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStar
|
|||||||
***********************************/
|
***********************************/
|
||||||
typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
|
typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
|
||||||
typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
|
typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
|
||||||
typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
|
typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream,
|
||||||
|
CUstreamCallback callback,
|
||||||
|
void *userData,
|
||||||
|
unsigned int flags);
|
||||||
|
|
||||||
typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
|
typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
|
||||||
typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
|
typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
|
||||||
@ -1775,17 +1829,28 @@ typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
|
|||||||
**
|
**
|
||||||
***********************************/
|
***********************************/
|
||||||
typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
|
typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
|
||||||
typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
|
typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray,
|
||||||
|
CUgraphicsResource resource,
|
||||||
|
unsigned int arrayIndex,
|
||||||
|
unsigned int mipLevel);
|
||||||
|
|
||||||
#if __CUDA_API_VERSION >= 3020
|
#if __CUDA_API_VERSION >= 3020
|
||||||
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr,
|
||||||
|
size_t *pSize,
|
||||||
|
CUgraphicsResource resource);
|
||||||
#else
|
#else
|
||||||
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr,
|
||||||
|
unsigned int *pSize,
|
||||||
|
CUgraphicsResource resource);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
|
typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
|
||||||
typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
|
typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count,
|
||||||
typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
|
CUgraphicsResource *resources,
|
||||||
|
CUstream hStream);
|
||||||
|
typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count,
|
||||||
|
CUgraphicsResource *resources,
|
||||||
|
CUstream hStream);
|
||||||
|
|
||||||
/************************************
|
/************************************
|
||||||
**
|
**
|
||||||
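
Aside for reviewers: these `tcu*` typedefs only describe function-pointer shapes; the dynlink layer binds them to driver exports at runtime. A minimal sketch of that binding, assuming a POSIX loader and this header's typedefs (the helper and variable names below are illustrative, not part of this change):

    #include <dlfcn.h> // POSIX; the sample's loader wraps this (and LoadLibrary on Windows)

    static tcuLaunchKernel *pfn_cuLaunchKernel = 0;

    // Resolve the driver export whose shape the tcuLaunchKernel typedef models.
    static int bindLaunchKernel(void *libcuda)
    {
        pfn_cuLaunchKernel = (tcuLaunchKernel *)dlsym(libcuda, "cuLaunchKernel");
        return pfn_cuLaunchKernel != 0;
    }

Once bound, `pfn_cuLaunchKernel(...)` is called exactly like the runtime-linked `cuLaunchKernel`.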

@@ -14,21 +14,17 @@
 #ifndef HELPER_CUDA_DRVAPI_H
 #define HELPER_CUDA_DRVAPI_H
 
+#include <helper_string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include <helper_string.h>
-
 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
 #endif
 
 #ifndef HELPER_CUDA_DRVAPI_H
-inline int ftoi(float value) {
-  return (value >= 0 ? static_cast<int>(value + 0.5)
-                     : static_cast<int>(value - 0.5));
-}
+inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
 #endif
 
 #ifndef EXIT_WAIVED
@@ -47,39 +43,43 @@ inline int ftoi(float value) {
 #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
 
 // These are the inline versions for all of the SDK helper functions
-inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
+inline void __checkCudaErrors(CUresult err, const char *file, const int line)
+{
     if (CUDA_SUCCESS != err) {
         const char *errorStr = NULL;
         cuGetErrorString(err, &errorStr);
         fprintf(stderr,
                 "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
                 "line %i.\n",
-                err, errorStr, file, line);
+                err,
+                errorStr,
+                file,
+                line);
         exit(EXIT_FAILURE);
     }
 }
 #endif
 
 // This function wraps the CUDA Driver API into a template function
-template <class T>
-inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
-                             int device) {
+template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
+{
     checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
 }
 #endif
 
 // Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2CoresDRV(int major, int minor) {
+inline int _ConvertSMVer2CoresDRV(int major, int minor)
+{
     // Defines for GPU Architecture types (using the SM version to determine the #
     // of cores per SM
-    typedef struct {
+    typedef struct
+    {
        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
                // minor version
        int Cores;
    } sSMtoCores;
 
-    sSMtoCores nGpuArchCoresPerSM[] = {
-        {0x30, 192},
+    sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
        {0x32, 192},
        {0x35, 192},
        {0x37, 192},
@@ -110,16 +110,18 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
 
    // If we don't find the values, we default use the previous one to run
    // properly
-    printf(
-        "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
-        major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
+           major,
+           minor,
+           nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
 }
 // end of GPU Architecture definitions
 
 #ifdef __cuda_cuda_h__
 // General GPU Device CUDA Initialization
-inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
+inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
+{
    int cuDevice = 0;
    int deviceCount = 0;
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
@@ -140,11 +142,8 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
 
    if (dev > deviceCount - 1) {
        fprintf(stderr, "\n");
-        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
-                deviceCount);
-        fprintf(stderr,
-                ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
-                dev);
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
+        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
        fprintf(stderr, "\n");
        return -dev;
    }
@@ -171,7 +170,8 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
 }
 
 // This function returns the best GPU based on performance
-inline int gpuGetMaxGflopsDeviceIdDRV() {
+inline int gpuGetMaxGflopsDeviceIdDRV()
+{
    CUdevice current_device = 0;
    CUdevice max_perf_device = 0;
    int device_count = 0;
@@ -187,8 +187,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
    checkCudaErrors(cuDeviceGetCount(&device_count));
 
    if (device_count == 0) {
-        fprintf(stderr,
-                "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
+        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }
 
@@ -196,36 +195,31 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
    current_device = 0;
 
    while (current_device < device_count) {
-        checkCudaErrors(cuDeviceGetAttribute(
-            &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-            current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+        checkCudaErrors(
+            cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
 
        int computeMode;
-        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
-                              current_device);
+        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
 
        if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
            if (major == 9999 && minor == 9999) {
                sm_per_multiproc = 1;
-            } else {
+            }
+            else {
                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
            }
 
-            unsigned long long compute_perf =
-                (unsigned long long)(multiProcessorCount * sm_per_multiproc *
-                                     clockRate);
+            unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
 
            if (compute_perf > max_compute_perf) {
                max_compute_perf = compute_perf;
                max_perf_device = current_device;
            }
-        } else {
+        }
+        else {
            devices_prohibited++;
        }
 
@@ -243,7 +237,8 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
 }
 
 // General initialization call to pick the best CUDA Device
-inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
+inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
+{
    CUdevice cuDevice;
    int devID = 0;
 
@@ -255,7 +250,8 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }
-    } else {
+    }
+    else {
        // Otherwise pick the device with highest Gflops/s
        char name[100];
        devID = gpuGetMaxGflopsDeviceIdDRV();
@@ -269,7 +265,8 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
    return cuDevice;
 }
 
-inline CUdevice findIntegratedGPUDrv() {
+inline CUdevice findIntegratedGPUDrv()
+{
    CUdevice current_device = 0;
    int device_count = 0;
    int devices_prohibited = 0;
@@ -286,28 +283,22 @@ inline CUdevice findIntegratedGPUDrv() {
    // Find the integrated GPU which is compute capable
    while (current_device < device_count) {
        int computeMode = -1;
-        checkCudaErrors(cuDeviceGetAttribute(
-            &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
 
        // If GPU is integrated and is not running on Compute Mode prohibited use
        // that
        if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
            int major = 0, minor = 0;
            char deviceName[256];
-            checkCudaErrors(cuDeviceGetAttribute(
-                &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                current_device));
-            checkCudaErrors(cuDeviceGetAttribute(
-                &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                current_device));
+            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
+            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
            checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
-            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-                   current_device, deviceName, major, minor);
+            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
 
            return current_device;
-        } else {
+        }
+        else {
            devices_prohibited++;
        }
 
@@ -323,29 +314,26 @@ inline CUdevice findIntegratedGPUDrv() {
 }
 
 // General check for CUDA GPU SM Capabilities
-inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
-                                     int devID) {
+inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
+{
    CUdevice cuDevice;
    char name[256];
    int major = 0, minor = 0;
 
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(
-        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(
-        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
 
-    if ((major > major_version) ||
-        (major == major_version && minor >= minor_version)) {
-        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
-               major, minor);
+    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
+        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
        return true;
-    } else {
-        printf(
-            "No GPU device was found that can support CUDA compute capability "
-            "%d.%d.\n",
-            major_version, minor_version);
+    }
+    else {
+        printf("No GPU device was found that can support CUDA compute capability "
+               "%d.%d.\n",
+               major_version,
+               minor_version);
        return false;
    }
 }
@@ -354,4 +342,3 @@ inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
 // end of CUDA Helper Functions
 
 #endif // HELPER_CUDA_DRVAPI_H
-
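
For context, the expected call order of the helpers in this header, as a hedged sketch only (the SM 3.0 threshold and the `main` shape below are illustrative; device initialization happens inside the selection helpers):

    int main(int argc, char **argv)
    {
        // Pick -device=n if given, otherwise the max-GFLOPS device.
        CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);

        if (!checkCudaCapabilitiesDRV(3, 0, (int)dev)) {
            return EXIT_WAIVED; // defined earlier in this header
        }

        int smCount = 0;
        getCudaAttribute<int>(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, (int)dev);
        printf("SMs: %d\n", smCount);
        return EXIT_SUCCESS;
    }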

@@ -43,10 +43,10 @@
 */
 
 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 // includes, CUDA
 #include "cuda_drvapi_dynlink.h"
@@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
 ////////////////////////////////////////////////////////////////////////////////
 void randomInit(float *data, size_t size)
 {
-    for (size_t i = 0; i < size; ++i)
-    {
+    for (size_t i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
 }
@@ -100,18 +99,14 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
 
    // This assumes that the user is attempting to specify a explicit device -device=n
-    if (argc > 1)
-    {
+    if (argc > 1) {
        bool bFound = false;
 
-        for (int param=0; param < argc; param++)
-        {
-            if (!strncmp(argv[param], "-device", 7))
-            {
+        for (int param = 0; param < argc; param++) {
+            if (!strncmp(argv[param], "-device", 7)) {
                int i = (int)strlen(argv[1]);
 
-                while (argv[1][i] != '=')
-                {
+                while (argv[1][i] != '=') {
                    i--;
                }
 
@@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    int deviceCount = 0;
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
 
-    if (deviceCount == 0)
-    {
+    if (deviceCount == 0) {
        fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
        exit(EXIT_SUCCESS);
    }
 
-    if (devID < 0) devID = 0;
+    if (devID < 0)
+        devID = 0;
 
-    if (devID > deviceCount -1)
-    {
+    if (devID > deviceCount - 1) {
        fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
        status = CUDA_ERROR_NOT_FOUND;
 
@@ -159,8 +153,7 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    // create context for picked device
    status = cuCtxCreate(&g_cuContext, 0, cuDevice);
 
-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
        cuCtxDestroy(g_cuContext);
        exit(EXIT_SUCCESS);
    }
@@ -191,9 +184,11 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    printf("> Compiling CUDA module\n");
 
 #if defined(_WIN64) || defined(__LP64__)
-    status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+    status =
+        cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #else
-    status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+    status =
+        cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #endif
 
    printf("> PTX JIT log:\n%s\n", jitLogBuffer);
@@ -203,19 +198,17 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
        delete[] jitLogBuffer;
    }
 
-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
        printf("Error while compiling PTX\n");
        cuCtxDestroy(g_cuContext);
        exit(EXIT_FAILURE);
    }
 
    // retrieve CUDA function from the compiled module
-    status = cuModuleGetFunction(&cuFunction, cuModule,
-                                 (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
+    status = cuModuleGetFunction(
+        &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
 
-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
        cuCtxDestroy(g_cuContext);
        exit(EXIT_FAILURE);
    }
@@ -280,10 +273,8 @@ int main(int argc, char **argv)
    int Matrix_Width_B = WB;
    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
 
-    checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
-                                   block_size , block_size , 1,
-                                   0,
-                                   NULL, args, NULL));
+    checkCudaErrors(cuLaunchKernel(
+        matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
 }
 #else // __CUDA_API_VERSION <= 3020
 {
@@ -331,8 +322,7 @@ int main(int argc, char **argv)
    // check result
    float diff = 0.0f;
 
-    for (unsigned int i=0; i<size_C; i++)
-    {
+    for (unsigned int i = 0; i < size_C; i++) {
        float tmp = reference[i] - h_C[i];
        diff += tmp * tmp;
    }

@@ -28,8 +28,7 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // export C interface
-extern "C"
-void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
 
 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set
@@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
 //! @param hA         height of matrix A
 //! @param wB         width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void
-computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 {
    for (unsigned int i = 0; i < hA; ++i)
-        for (unsigned int j = 0; j < wB; ++j)
-        {
+        for (unsigned int j = 0; j < wB; ++j) {
            double sum = 0;
 
-            for (unsigned int k = 0; k < wA; ++k)
-            {
+            for (unsigned int k = 0; k < wA; ++k) {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
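
As a usage sketch of the reference path above (names and the `epsilon` tolerance are assumptions for illustration, not taken from this diff), a GPU result can be validated against `computeGold` with a simple L2-norm check, which is the same squared-error accumulation the sample's own check performs:

    #include <cmath>
    #include <cstdlib>

    // Validate a GPU result h_C against the CPU reference; sizes follow the
    // doc comment above (hA x wA times wA x wB).
    bool verifyAgainstGold(const float *h_C, const float *h_A, const float *h_B,
                           unsigned int hA, unsigned int wA, unsigned int wB, float epsilon)
    {
        float *reference = (float *)malloc(hA * wB * sizeof(float));
        computeGold(reference, h_A, h_B, hA, wA, wB);

        float errSq = 0.0f;
        for (unsigned int i = 0; i < hA * wB; i++) {
            float tmp = reference[i] - h_C[i]; // accumulate squared error
            errSq += tmp * tmp;
        }
        free(reference);
        return sqrtf(errSq) < epsilon;
    }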

File diff suppressed because it is too large

@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_32_ptxdump_h__
 
 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 extern unsigned char matrixMul_kernel_32_ptxdump[25784];

File diff suppressed because it is too large

@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_64_ptxdump_h__
 
 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 extern unsigned char matrixMul_kernel_64_ptxdump[26489];

@@ -42,17 +42,19 @@
 */
 
 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>
 
 // CUDA runtime
 #include <cuda_runtime.h>
 
 #include "nvrtc_helper.h"
 
 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>
 
-void constantInit(float *data, int size, float val) {
+void constantInit(float *data, int size, float val)
+{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
@@ -61,8 +63,8 @@ void constantInit(float *data, int size, float val) {
 /**
  * Run a simple test of matrix multiplication using CUDA
  */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
-                   dim3 &dimsB) {
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
+{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
@@ -114,24 +116,27 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
 
    CUfunction kernel_addr;
    if (block_size == 16) {
-        checkCudaErrors(
-            cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
-    } else {
-        checkCudaErrors(
-            cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
+    }
+    else {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
    }
 
-    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
-                   (void *)&dimsB.x};
+    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
 
    // Execute the kernel
    int nIter = 300;
 
    for (int j = 0; j < nIter; j++) {
-        checkCudaErrors(
-            cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
-                           threads.x, threads.y, threads.z,     /* block dim */
-                           0, 0,    /* shared mem, stream */
-                           &arr[0], /* arguments */
-                           0));
+        checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z, /* grid dim */
+                                       threads.x,
+                                       threads.y,
+                                       threads.z, /* block dim */
+                                       0,
+                                       0, /* shared mem, stream */
+                                       &arr[0], /* arguments */
+                                       0));
 
@@ -157,16 +162,14 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
    double rel_err = abs_err / abs_val / dot_length;
 
    if (rel_err > eps) {
-        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
-               h_C[i], dimsA.x * valB, eps);
+        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
        correct = false;
        }
    }
 
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
 
-    printf(
-        "\nNOTE: The CUDA Samples are not meant for performance measurements. "
-        "Results may vary when GPU Boost is enabled.\n");
+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n");
 
    // Clean up memory
@@ -180,7 +183,8 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
 
    if (correct) {
        return EXIT_SUCCESS;
-    } else {
+    }
+    else {
        return EXIT_FAILURE;
    }
 }
@@ -189,16 +193,15 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
 * Program main
 */
 
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");
 
-    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-        checkCmdLineFlag(argc, (const char **)argv, "?")) {
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf(
-            "  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
 
        exit(EXIT_SUCCESS);
    }
@@ -234,13 +237,11 @@ int main(int argc, char **argv) {
    }
 
    if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-               dimsA.x, dimsB.y);
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
 
-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
-           dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
 
    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
 

@@ -48,11 +48,10 @@
 
 #include <cooperative_groups.h>
 
-template <int BLOCK_SIZE>
-__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
+template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
    // Handle to thread block group
-    cooperative_groups::thread_block cta =
-        cooperative_groups::this_thread_block();
+    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
@@ -120,12 +119,12 @@ __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
    C[c + wB * ty + tx] = Csub;
 }
 
-extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
-                                                 int wA, int wB) {
+extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
+{
    matrixMulCUDA<16>(C, A, B, wA, wB);
 }
 
-extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
-                                                 int wA, int wB) {
+extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
+{
    matrixMulCUDA<32>(C, A, B, wA, wB);
 }
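
A brief note on launch geometry for these kernels: each thread computes one element of C, so the grid is sized from the output matrix. A minimal sketch under that assumption (the helper name is illustrative; the sample's host code builds `grid` and `threads` inline):

    #include <cuda_runtime.h>

    // One thread per C element; assumes dimsB.x and dimsA.y are multiples of block_size.
    dim3 makeMatrixMulGrid(dim3 dimsA, dim3 dimsB, int block_size)
    {
        dim3 threads(block_size, block_size, 1);
        return dim3(dimsB.x / threads.x, dimsA.y / threads.y, 1);
    }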

@@ -28,12 +28,13 @@
 #include <cooperative_groups.h>
 
 namespace cg = cooperative_groups;
-#include <helper_cuda.h>
 #include <assert.h>
+#include <helper_cuda.h>
 
 #include "mergeSort_common.h"
 
-inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
-                                  uint &valB, uint arrowDir) {
+inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
+{
    uint t;
 
    if ((keyA > keyB) == arrowDir) {
@@ -46,9 +47,9 @@ inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
    }
 }
 
-__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
-                                        uint *d_SrcKey, uint *d_SrcVal,
-                                        uint arrayLength, uint sortDir) {
+__global__ void
+bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
+{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    // Shared memory storage for one or more short vectors
@@ -62,10 +63,8 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
-    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-        d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-        d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
 
    for (uint size = 2; size < arrayLength; size <<= 1) {
        // Bitonic merge
@@ -74,8 +73,7 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
        for (uint stride = size / 2; stride > 0; stride >>= 1) {
            cg::sync(cta);
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                       s_val[pos + stride], dir);
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
        }
    }
 
@@ -84,26 +82,25 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-        Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                   s_val[pos + stride], sortDir);
+        Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
    }
    }
 
    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
-    d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
-        s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-    d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
-        s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
 
 // Helper function (also used by odd-even merge sort)
-extern "C" uint factorRadix2(uint *log2L, uint L) {
+extern "C" uint factorRadix2(uint *log2L, uint L)
+{
    if (!L) {
        *log2L = 0;
        return 0;
-    } else {
+    }
+    else {
        for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
            ;
 
@@ -111,10 +108,14 @@ extern "C" uint factorRadix2(uint *log2L, uint L) {
    }
 }
 
-extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
-                                  uint *d_SrcKey, uint *d_SrcVal,
-                                  uint batchSize, uint arrayLength,
-                                  uint sortDir) {
+extern "C" void bitonicSortShared(uint *d_DstKey,
+                                  uint *d_DstVal,
+                                  uint *d_SrcKey,
+                                  uint *d_SrcVal,
+                                  uint batchSize,
+                                  uint arrayLength,
+                                  uint sortDir)
+{
    // Nothing to sort
    if (arrayLength < 2) {
        return;
@@ -131,32 +132,25 @@ extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
    assert(arrayLength <= SHARED_SIZE_LIMIT);
    assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
 
-    bitonicSortSharedKernel<<<blockCount, threadCount>>>(
-        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
+    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
    getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) {
-  return ((a % b) == 0) ? (a / b) : (a / b + 1);
-}
+static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
 
-static inline __host__ __device__ uint getSampleCount(uint dividend) {
-  return iDivUp(dividend, SAMPLE_STRIDE);
-}
+static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
 
 template <uint sortDir>
-static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
-                                                 uint &flagA, uint &keyB,
-                                                 uint &valB, uint &flagB,
-                                                 uint arrowDir) {
+static inline __device__ void
+ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
+{
    uint t;
 
-    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
-        ((arrowDir == sortDir) && (flagA == 1)) ||
-        ((arrowDir != sortDir) && (flagB == 1))) {
+    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
+        || ((arrowDir != sortDir) && (flagB == 1))) {
        t = keyA;
        keyA = keyB;
        keyB = t;
@@ -170,9 +164,15 @@ static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
 }
 
 template <uint sortDir>
-__global__ void bitonicMergeElementaryIntervalsKernel(
-    uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
-    uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
+__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
+                                                      uint *d_DstVal,
+                                                      uint *d_SrcKey,
+                                                      uint *d_SrcVal,
+                                                      uint *d_LimitsA,
+                                                      uint *d_LimitsB,
+                                                      uint stride,
+                                                      uint N)
+{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
@@ -200,10 +200,8 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
        startSrcB = d_LimitsB[blockIdx.x];
        startDst = startSrcA + startSrcB;
 
-        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
-                                                        : segmentElementsA;
-        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
-                                                        : segmentElementsB;
+        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
+        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA = endSrcA - startSrcA;
        lenSrcB = endSrcB - startSrcB;
    }
@@ -222,10 +220,8 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
 
    // Prepare for bitonic merge by inversing the ordering
    if (threadIdx.x < lenSrcB) {
-        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-            d_SrcKey[stride + startSrcB + threadIdx.x];
-        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-            d_SrcVal[stride + startSrcB + threadIdx.x];
+        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
+        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
    }
 
@@ -233,9 +229,13 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-        ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
-                                    s_key[pos + stride], s_val[pos + stride],
-                                    s_inf[pos + stride], sortDir);
+        ComparatorExtended<sortDir>(s_key[pos + 0],
+                                    s_val[pos + 0],
+                                    s_inf[pos + 0],
+                                    s_key[pos + stride],
+                                    s_val[pos + stride],
+                                    s_inf[pos + stride],
+                                    sortDir);
    }
 
    // Store sorted data
@@ -254,26 +254,28 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
    }
 }
 
-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
-                                                uint *d_SrcKey, uint *d_SrcVal,
-                                                uint *d_LimitsA,
-                                                uint *d_LimitsB, uint stride,
-                                                uint N, uint sortDir) {
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
+                                                uint *d_DstVal,
+                                                uint *d_SrcKey,
+                                                uint *d_SrcVal,
+                                                uint *d_LimitsA,
+                                                uint *d_LimitsB,
+                                                uint stride,
+                                                uint N,
+                                                uint sortDir)
+{
    uint lastSegmentElements = N % (2 * stride);
 
-    uint mergePairs = (lastSegmentElements > stride)
-                          ? getSampleCount(N)
-                          : (N - lastSegmentElements) / SAMPLE_STRIDE;
+    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
 
    if (sortDir) {
-        bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
-            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
-            N);
+        bitonicMergeElementaryIntervalsKernel<1U>
+            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
-    } else {
-        bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
-            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
-            N);
+    }
+    else {
+        bitonicMergeElementaryIntervalsKernel<0U>
+            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
 }
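
To make the `Comparator` exchange rule above concrete, here is a host-side, keys-only analogue as a self-contained sketch (not part of this change): with `arrowDir == 1` the pair ends up ascending, with `arrowDir == 0` descending.

    #include <cstdio>

    static void comparatorHost(unsigned &keyA, unsigned &keyB, unsigned arrowDir)
    {
        if ((keyA > keyB) == arrowDir) {
            unsigned t = keyA; // swap into the requested direction
            keyA = keyB;
            keyB = t;
        }
    }

    int main()
    {
        unsigned a = 5, b = 3;
        comparatorHost(a, b, 1);
        std::printf("%u %u\n", a, b); // prints "3 5"
        return 0;
    }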
|
|||||||
@ -26,17 +26,19 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <helper_functions.h>
|
|
||||||
#include <helper_cuda.h>
|
|
||||||
#include "mergeSort_common.h"
|
#include "mergeSort_common.h"
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Test driver
|
// Test driver
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
||||||
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
||||||
StopWatchInterface *hTimer = NULL;
|
StopWatchInterface *hTimer = NULL;
|
||||||
@@ -75,10 +77,8 @@ int main(int argc, char **argv) {
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();
@@ -93,10 +93,8 @@ int main(int argc, char **argv) {
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);

@@ -39,21 +39,19 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
    --x;
    x |= x >> 1;
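For orientation: `iDivUp` is a ceiling division, and `getSampleCount` counts how many SAMPLE_STRIDE-wide samples are needed to cover `dividend` elements. A small host-side sketch; the value 128 for SAMPLE_STRIDE is an assumption here (it is defined in mergeSort_common.h, which is not shown in this diff):

    #include <cassert>

    typedef unsigned int uint;

    static const uint SAMPLE_STRIDE = 128; // assumed; set in mergeSort_common.h

    static uint iDivUp(uint a, uint b) { return (a % b == 0) ? (a / b) : (a / b + 1); }
    static uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

    int main()
    {
        assert(iDivUp(1024, 128) == 8);    // exact multiple: no rounding
        assert(iDivUp(1025, 128) == 9);    // one extra, partially filled sample
        assert(getSampleCount(1000) == 8); // 7 full samples plus a partial one
        return 0;
    }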
@@ -66,9 +64,8 @@ static inline __device__ uint nextPowerOfTwo(uint x) {
    return 1U << (W - __clz(x - 1));
}

template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
@@ -78,8 +75,7 @@ static inline __device__ uint binarySearchInclusive(uint val, uint *data,
    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }
@@ -87,9 +83,8 @@ static inline __device__ uint binarySearchInclusive(uint val, uint *data,
    return pos;
}

template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
@@ -99,8 +94,7 @@ static inline __device__ uint binarySearchExclusive(uint val, uint *data,
    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }
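The two searches differ only in the comparison: for ascending order the inclusive variant counts elements less than or equal to `val`, the exclusive variant counts elements strictly less than `val`. Using one for each source array breaks ties consistently, which is what keeps the merge stable. A host-side, linear-scan equivalent of the same ranks (the kernels use the strided binary search above to get them in O(log L) steps; ascending order assumed):

    typedef unsigned int uint;

    // Number of elements in sorted data[0..L) that are <= val (inclusive rank).
    static uint rankInclusive(const uint *data, uint L, uint val)
    {
        uint pos = 0;
        while (pos < L && data[pos] <= val)
            pos++;
        return pos;
    }

    // Number of elements in sorted data[0..L) that are < val (exclusive rank).
    static uint rankExclusive(const uint *data, uint L, uint val)
    {
        uint pos = 0;
        while (pos < L && data[pos] < val)
            pos++;
        return pos;
    }

    int main()
    {
        uint data[] = {1, 3, 3, 7};
        // val = 3: inclusive rank is 3 (counts both 3s), exclusive rank is 1.
        return !(rankInclusive(data, 4, 3) == 3 && rankExclusive(data, 4, 3) == 1);
    }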
@@ -112,9 +106,8 @@ static inline __device__ uint binarySearchExclusive(uint val, uint *data,
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
@@ -126,10 +119,8 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint lPos = threadIdx.x & (stride - 1);
@@ -141,12 +132,8 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
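The positions computed here are the classic merge-by-rank identity: an element's slot in the merged output is its index in its own run plus its rank in the other run. A tiny worked trace (ascending order assumed), using the rank helpers sketched earlier:

    // A = {1, 3}, B = {2, 3}; merged = {1, 2, 3, 3}
    // A[0] = 1: index 0 + rankExclusive(B, 1) = 0 + 0 -> output slot 0
    // A[1] = 3: index 1 + rankExclusive(B, 3) = 1 + 1 -> output slot 2
    // B[0] = 2: index 0 + rankInclusive(A, 2) = 0 + 1 -> output slot 1
    // B[1] = 3: index 1 + rankInclusive(A, 3) = 1 + 2 -> output slot 3
    // Exclusive-vs-inclusive tie-breaking makes the two 3s land in distinct
    // slots (2 and 3) with A's copy first, i.e. the merge is stable and every
    // thread can scatter its element without conflicting writes.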
@@ -158,15 +145,18 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}

static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint batchSize,
                            uint arrayLength,
                            uint sortDir)
{
    if (arrayLength < 2) {
        return;
    }
@@ -177,12 +167,11 @@ static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
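Each block of this bottom-level pass sorts one SHARED_SIZE_LIMIT-element tile entirely in shared memory, with every thread handling two elements; hence threadCount = SHARED_SIZE_LIMIT / 2. A small arithmetic sketch, assuming SHARED_SIZE_LIMIT = 1024 (its value in mergeSort_common.h, not shown in this diff) and a batch covering N = batchSize * arrayLength keys:

    typedef unsigned int uint;

    static const uint SHARED_SIZE_LIMIT = 1024U; // assumed; set in mergeSort_common.h

    // One block per 1024-element tile, 512 threads per block
    // (each thread loads, merges, and stores two elements).
    int main()
    {
        uint N = 1U << 20;                        // 1,048,576 keys
        uint blockCount  = N / SHARED_SIZE_LIMIT; // 1024 blocks
        uint threadCount = SHARED_SIZE_LIMIT / 2; // 512 threads per block
        return !(blockCount == 1024 && threadCount == 512);
    }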
@@ -191,9 +180,9 @@ static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
@@ -214,33 +203,30 @@ __global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
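The threadCount expression assigns one thread per SAMPLE_STRIDE-wide sample of each segment pair: when the trailing segment is longer than `stride` it is rounded up to a full 2*stride pair, otherwise it is excluded here and copied through unchanged later. A worked instance of both branches, again assuming SAMPLE_STRIDE = 128:

    typedef unsigned int uint;

    static const uint SAMPLE_STRIDE = 128; // assumed

    int main()
    {
        // Tail exactly equal to stride: not merged this stage, so excluded.
        uint N = 6144, stride = 2048;
        uint last = N % (2 * stride); // 6144 % 4096 = 2048 == stride
        uint threadCount = (last > stride) ? (N + 2 * stride - last) / (2 * SAMPLE_STRIDE)
                                           : (N - last) / (2 * SAMPLE_STRIDE); // 4096/256 = 16

        // Tail longer than stride: padded up to a full pair and merged.
        uint N2 = 7000;
        uint last2 = N2 % (2 * stride); // 2904 > stride
        uint threadCount2 = (N2 + 2 * stride - last2) / (2 * SAMPLE_STRIDE); // 8192/256 = 32

        return !(threadCount == 16 && threadCount2 == 32);
    }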
@@ -248,9 +234,8 @@ static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
@@ -269,36 +254,29 @@ __global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,

    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                      + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                      + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
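Steps 1 and 2 together chop each pair of sorted segments into "elementary intervals" whose size is bounded by construction, which is what lets step 3 below merge every interval with a single SAMPLE_STRIDE-thread block in shared memory. The counting argument, sketched in comments:

    // Why an elementary interval holds at most SAMPLE_STRIDE elements per side:
    // samples are taken every SAMPLE_STRIDE positions of segment A and of
    // segment B, and step 2 merge-sorts the union of the two sample sets.
    // Between two adjacent boundaries of that merged sample list there is no
    // A-sample and no B-sample strictly inside, so fewer than SAMPLE_STRIDE
    // elements of A and fewer than SAMPLE_STRIDE elements of B can lie there.
    // Hence the <<<mergePairs, SAMPLE_STRIDE>>> launch below, with a
    // 2 * SAMPLE_STRIDE shared-memory staging buffer, always has room.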
@@ -306,24 +284,30 @@ static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey,
                             uint *dstVal,
                             uint *srcAKey,
                             uint *srcAVal,
                             uint *srcBKey,
                             uint *srcBVal,
                             uint lenA,
                             uint nPowTwoLenA,
                             uint lenB,
                             uint nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

    if (threadIdx.x < lenA) {
        keyA = srcAKey[threadIdx.x];
        valA = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

    if (threadIdx.x < lenB) {
        keyB = srcBKey[threadIdx.x];
        valB = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

    cg::sync(cta);
@@ -340,10 +324,15 @@ inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
}

template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                               uint *d_DstVal,
                                               uint *d_SrcKey,
                                               uint *d_SrcVal,
                                               uint *d_LimitsA,
                                               uint *d_LimitsB,
                                               uint stride,
                                               uint N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
@@ -368,10 +357,8 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,

    startSrcA = d_LimitsA[blockIdx.x];
    startSrcB = d_LimitsB[blockIdx.x];
    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
    lenSrcA = endSrcA - startSrcA;
    lenSrcB = endSrcB - startSrcB;
    startDstA = startSrcA + startSrcB;
@@ -387,17 +374,23 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
    }

    if (threadIdx.x < lenSrcB) {
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
    }

    // Merge data in shared memory
    cg::sync(cta);
    merge<sortDir>(s_key,
                   s_val,
                   s_key + 0,
                   s_val + 0,
                   s_key + SAMPLE_STRIDE,
                   s_val + SAMPLE_STRIDE,
                   lenSrcA,
                   SAMPLE_STRIDE,
                   lenSrcB,
                   SAMPLE_STRIDE,
                   cta);

    // Store merged data
    cg::sync(cta);
@@ -413,63 +406,77 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
    }
}

static void mergeElementaryIntervals(uint *d_DstKey,
                                     uint *d_DstVal,
                                     uint *d_SrcKey,
                                     uint *d_SrcVal,
                                     uint *d_LimitsA,
                                     uint *d_LimitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint batchSize,
                                  uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N,
                                                uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey,
                          uint *d_DstVal,
                          uint *d_BufKey,
                          uint *d_BufVal,
                          uint *d_SrcKey,
                          uint *d_SrcVal,
                          uint N,
                          uint sortDir)
{
    uint stageCount = 0;

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
@@ -482,7 +489,8 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
        ival = d_BufVal;
        okey = d_DstKey;
        oval = d_DstVal;
    }
    else {
        ikey = d_DstKey;
        ival = d_DstVal;
        okey = d_BufKey;
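The initial buffer assignment above is decided by the parity of `stageCount`, counted in the first loop of mergeSort: every merge stage ping-pongs between the destination and buffer arrays, so the driver picks the starting side that leaves the final stage writing into d_DstKey/d_DstVal. The argument in outline (a sketch of the reasoning, not the sample's literal code):

    // Parity argument: after k buffer swaps, the data sits in the starting
    // array iff k is even. stageCount is the number of merge stages, i.e. how
    // many doublings of stride it takes to grow SHARED_SIZE_LIMIT-sized runs
    // to cover N. So:
    //   stageCount even -> start in dst (an even number of swaps returns there)
    //   stageCount odd  -> start in buf (the odd number of swaps lands in dst)
    // This is exactly what testing the low bit of stageCount selects.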
@@ -491,8 +499,7 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,

    assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
    assert(N % SHARED_SIZE_LIMIT == 0);
    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);
@@ -504,18 +511,19 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
        mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
                                       ikey + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
                                       ival + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
        }

        uint *t;

@@ -36,14 +36,12 @@ typedef unsigned int uint;
////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);

extern "C" void closeMergeSort(void);

extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

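The header above is the sample's whole public surface. A hedged end-to-end sketch of how a driver might call it (the wrapper function name and sizes are illustrative, and error checking is trimmed; it is not the sample's literal main.cpp):

    // Illustrative driver for the extern "C" API declared above:
    // sort N key/value pairs ascending on the GPU.
    #include <cuda_runtime.h>

    typedef unsigned int uint;

    extern "C" void initMergeSort(void);
    extern "C" void closeMergeSort(void);
    extern "C" void
    mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

    void sortOnDevice(uint *d_SrcKey, uint *d_SrcVal, uint N)
    {
        // N must be a multiple of SHARED_SIZE_LIMIT and no larger than
        // SAMPLE_STRIDE * MAX_SAMPLE_COUNT (both asserted inside mergeSort).
        uint *d_DstKey, *d_DstVal, *d_BufKey, *d_BufVal;
        cudaMalloc((void **)&d_DstKey, N * sizeof(uint));
        cudaMalloc((void **)&d_DstVal, N * sizeof(uint));
        cudaMalloc((void **)&d_BufKey, N * sizeof(uint));
        cudaMalloc((void **)&d_BufVal, N * sizeof(uint));

        initMergeSort(); // allocates the rank/limit scratch arrays
        mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, 1 /* ascending */);
        cudaDeviceSynchronize();
        closeMergeSort();

        cudaFree(d_BufVal);
        cudaFree(d_BufKey);
        cudaFree(d_DstVal);
        cudaFree(d_DstKey);
    }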
@@ -29,19 +29,20 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
@@ -49,12 +50,13 @@ static void checkOrder(uint *data, uint N, uint sortDir) {

static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
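This host-side `nextPowerOfTwo` uses the classic bit-smearing trick, whereas the device version seen earlier counts leading zeros with `__clz` instead. A worked trace:

    // Bit-smearing next-power-of-two, traced for x = 300 (0b1'0010'1100):
    //   --x                        -> 299 = 0b1'0010'1011
    //   x |= x >> 1 ... x >> 16    -> every bit below the top bit set:
    //                                 0b1'1111'1111 = 511
    //   ++x                        -> 512, the next power of two
    // Decrementing first makes exact powers of two map to themselves,
    // e.g. 256 -> 255 -> 255 (smeared) -> 256.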
@@ -64,7 +66,8 @@ static uint nextPowerOfTwo(uint x) {
    return ++x;
}

static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
@@ -74,8 +77,7 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }
@@ -83,7 +85,8 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
@@ -93,8 +96,7 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }
@@ -105,12 +107,10 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
|
|||||||
|
|
||||||
if (i < nA) {
|
if (i < nA) {
|
||||||
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||||
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
|
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
|
||||||
binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
|
srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
|
||||||
srcKey + segmentBase + stride, lenB, sortDir);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i < nB) {
|
if (i < nB) {
|
||||||
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||||
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
|
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
|
||||||
binarySearchInclusive(
|
srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
|
||||||
srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
|
|
||||||
srcKey + segmentBase, lenA, sortDir);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -142,12 +139,10 @@ static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
@@ -161,23 +156,20 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,

        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }
}
@@ -185,9 +177,16 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey,
                  uint *dstVal,
                  uint *srcAKey,
                  uint *srcAVal,
                  uint *srcBKey,
                  uint *srcBVal,
                  uint lenA,
                  uint lenB,
                  uint sortDir)
{
    checkOrder(srcAKey, lenA, sortDir);
    checkOrder(srcBKey, lenB, sortDir);
@@ -206,13 +205,18 @@ static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
    }
}

static void mergeElementaryIntervals(uint *dstKey,
                                     uint *dstVal,
                                     uint *srcKey,
                                     uint *srcVal,
                                     uint *limitsA,
                                     uint *limitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    for (uint pos = 0; pos < mergePairs; pos++) {
        uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
@@ -240,15 +244,18 @@ static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
              (srcKey + segmentBase + 0) + startPosA,
              (srcVal + segmentBase + 0) + startPosA,
              (srcKey + segmentBase + stride) + startPosB,
              (srcVal + segmentBase + stride) + startPosB,
              endPosA - startPosA,
              endPosB - startPosB,
              sortDir);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }
@@ -278,9 +285,9 @@ static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
{
    uint *ikey, *ival, *okey, *oval;
    uint stageCount = 0;

@@ -292,7 +299,8 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
        ival = bufVal;
        okey = dstKey;
        oval = dstVal;
    }
    else {
        ikey = dstKey;
        ival = dstVal;
        okey = bufKey;
@@ -304,8 +312,7 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
    memcpy(ival, srcVal, N * sizeof(uint));

    for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
        bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
    }

    printf("Merge...\n");
@@ -329,16 +336,15 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
        mergeRanksAndIndices(limitsB, ranksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            memcpy(
                okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
            memcpy(
                oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
        }

        uint *t;

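The host path mirrors the GPU pipeline step for step (a chunked bubble sort standing in for the shared-memory kernel, then the same three merge steps), which is what makes it a useful oracle: a divergence isolates the failing stage. An even simpler independent cross-check is also possible; the sketch below uses std::stable_sort as a substitute reference, named plainly — it is not the sample's method:

    // Independent oracle sketch: verify a result key array against std::stable_sort.
    #include <algorithm>
    #include <functional>
    #include <vector>

    typedef unsigned int uint;

    bool matchesReference(const uint *srcKey, const uint *resKey, uint N, uint sortDir)
    {
        std::vector<uint> ref(srcKey, srcKey + N);
        if (sortDir)
            std::stable_sort(ref.begin(), ref.end());                       // ascending
        else
            std::stable_sort(ref.begin(), ref.end(), std::greater<uint>()); // descending
        return std::equal(ref.begin(), ref.end(), resKey);
    }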
@@ -29,14 +29,15 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

@@ -51,8 +52,7 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,

    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));
@@ -61,11 +61,9 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
@@ -74,18 +72,15 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
@@ -95,7 +90,8 @@ brk:
    free(resHist);
    free(srcHist);

    if (flag)
        printf("OK\n");

    return flag;
}
@@ -103,30 +99,30 @@ brk:
////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

    printf("...inspecting keys and values array: ");

    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }

    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

    return correctFlag;
}

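The stability test works because `fillValues` seeds the value array with each key's original index: after sorting, equal keys must still carry increasing indices, and any decreasing pair flags an unstable reorder. The companion check `resKey[j] != srcKey[resVal[j]]` proves every output pair is a genuine input pair. In miniature:

    // srcKey = {5, 2, 5}, values seeded as indices: srcVal = {0, 1, 2}
    // A stable ascending sort gives resKey = {2, 5, 5}, resVal = {1, 0, 2}:
    //   resKey[j] == srcKey[resVal[j]] for every j -> values still track their keys
    //   the tied keys 5,5 carry values 0 < 2       -> original order kept: stable
    // An unstable sort could emit resVal = {1, 2, 0}; the keys still match,
    // but 2 > 0 on the tied keys trips the stability flag.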
@@ -29,9 +29,9 @@
#include <stdio.h>

// Includes CUDA
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
@@ -43,9 +43,11 @@ namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32> &tile32,
                                double &threadSum,
                                double *result)
{
    extern __shared__ double tmp[];

#pragma unroll
@ -62,9 +64,7 @@ __device__ void reduceBlockData(
|
|||||||
|
|
||||||
// The warp 0 will perform last round of reduction
|
// The warp 0 will perform last round of reduction
|
||||||
if (tile32.meta_group_rank() == 0) {
|
if (tile32.meta_group_rank() == 0) {
|
||||||
double beta = tile32.thread_rank() < tile32.meta_group_size()
|
double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
|
||||||
? tmp[tile32.thread_rank()]
|
|
||||||
: 0.0;
|
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
||||||
@ -81,8 +81,8 @@ __device__ void reduceBlockData(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
|
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
|
||||||
double *partialResults, int size) {
|
{
|
||||||
#if __CUDA_ARCH__ >= 700
|
#if __CUDA_ARCH__ >= 700
|
||||||
#pragma diag_suppress static_var_with_dynamic_init
|
#pragma diag_suppress static_var_with_dynamic_init
|
||||||
cg::thread_block cta = cg::this_thread_block();
|
cg::thread_block cta = cg::this_thread_block();
|
||||||
@ -105,8 +105,7 @@ __global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
|
|||||||
|
|
||||||
// Each thread block performs reduction of partial dotProducts and writes to
|
// Each thread block performs reduction of partial dotProducts and writes to
|
||||||
// global mem.
|
// global mem.
|
||||||
reduceBlockData<false>(barrier, tile32, threadSum,
|
reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
|
||||||
&partialResults[blockIdx.x]);
|
|
||||||
|
|
||||||
cg::sync(grid);
|
cg::sync(grid);
|
||||||
|
|
||||||
@ -137,15 +136,15 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Program main
|
// Program main
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
printf("%s starting...\n", argv[0]);
|
printf("%s starting...\n", argv[0]);
|
||||||
|
|
||||||
// This will pick the best possible CUDA capable device
|
// This will pick the best possible CUDA capable device
|
||||||
int dev = findCudaDevice(argc, (const char **)argv);
|
int dev = findCudaDevice(argc, (const char **)argv);
|
||||||
|
|
||||||
int major = 0;
|
int major = 0;
|
||||||
checkCudaErrors(
|
checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
|
||||||
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
|
|
||||||
|
|
||||||
// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
|
// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
|
||||||
if (major < 7) {
|
if (major < 7) {
|
||||||
@ -154,12 +153,10 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int supportsCooperativeLaunch = 0;
|
int supportsCooperativeLaunch = 0;
|
||||||
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
|
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
|
||||||
cudaDevAttrCooperativeLaunch, dev));
|
|
||||||
|
|
||||||
if (!supportsCooperativeLaunch) {
|
if (!supportsCooperativeLaunch) {
|
||||||
printf(
|
printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
|
||||||
"\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
|
|
||||||
"Waiving the run\n",
|
"Waiving the run\n",
|
||||||
dev);
|
dev);
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
@ -171,7 +168,8 @@ int main(int argc, char **argv) {
|
|||||||
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
|
||||||
|
{
|
||||||
float *vecA, *d_vecA;
|
float *vecA, *d_vecA;
|
||||||
float *vecB, *d_vecB;
|
float *vecB, *d_vecB;
|
||||||
double *d_partialResults;
|
double *d_partialResults;
|
||||||
@ -191,16 +189,14 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
|||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
|
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
|
||||||
cudaMemcpyHostToDevice, stream));
|
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
|
||||||
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
|
|
||||||
cudaMemcpyHostToDevice, stream));
|
|
||||||
|
|
||||||
// Kernel configuration, where a one-dimensional
|
// Kernel configuration, where a one-dimensional
|
||||||
// grid and one-dimensional blocks are configured.
|
// grid and one-dimensional blocks are configured.
|
||||||
int minGridSize = 0, blockSize = 0;
|
int minGridSize = 0, blockSize = 0;
|
||||||
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
|
checkCudaErrors(
|
||||||
&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
|
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
|
||||||
|
|
||||||
int smemSize = ((blockSize / 32) + 1) * sizeof(double);
|
int smemSize = ((blockSize / 32) + 1) * sizeof(double);
|
||||||
|
|
||||||
@ -209,28 +205,24 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
|||||||
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
|
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
|
||||||
|
|
||||||
int multiProcessorCount = 0;
|
int multiProcessorCount = 0;
|
||||||
checkCudaErrors(cudaDeviceGetAttribute(
|
checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
|
||||||
&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
|
|
||||||
|
|
||||||
minGridSize = multiProcessorCount * numBlocksPerSm;
|
minGridSize = multiProcessorCount * numBlocksPerSm;
|
||||||
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
|
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
|
||||||
|
|
||||||
printf(
|
printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
|
||||||
"Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
|
|
||||||
"blockSize = %d\n",
|
"blockSize = %d\n",
|
||||||
minGridSize, blockSize);
|
minGridSize,
|
||||||
|
blockSize);
|
||||||
|
|
||||||
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
|
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
|
||||||
|
|
||||||
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
|
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
|
||||||
(void *)&d_partialResults, (void *)&size};
|
|
||||||
|
|
||||||
checkCudaErrors(
|
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||||
cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
|
(void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
|
||||||
dimBlock, kernelArgs, smemSize, stream));
|
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
|
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
|
||||||
cudaMemcpyDeviceToHost, stream));
|
|
||||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
|
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
|
||||||
@ -239,7 +231,8 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
|||||||
if ((vecA[i] - expectedResult) > 0.00001) {
|
if ((vecA[i] - expectedResult) > 0.00001) {
|
||||||
printf("mismatch at i = %d\n", i);
|
printf("mismatch at i = %d\n", i);
|
||||||
break;
|
break;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
matches++;
|
matches++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
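The hunks above reflow, but do not change, the sample's cooperative-launch sequence: query an occupancy-friendly block size, cap the grid at the number of blocks that can be co-resident, pack the kernel parameters into an array of pointers, and launch through cudaLaunchCooperativeKernel so that grid-wide synchronization is legal. A minimal self-contained sketch of that sequence follows; the kernel name (myKernel) and the helper are illustrative placeholders, not code from this diff, and error checking is omitted.

#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

// Hypothetical kernel: every thread scales one element, then the whole
// grid synchronizes -- legal only under a cooperative launch.
__global__ void myKernel(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
    cg::this_grid().sync();
}

void launchCooperatively(float *d_data, int n, int dev, cudaStream_t stream)
{
    // Pick a block size with good occupancy for this kernel.
    int minGridSize = 0, blockSize = 0;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)myKernel, 0, n);

    // Grid-wide sync requires every block to be resident, so the grid is
    // capped at blocks-per-SM times the SM count.
    int numBlocksPerSm = 0, smCount = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, myKernel, blockSize, 0);
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, dev);
    dim3 dimGrid(smCount * numBlocksPerSm, 1, 1), dimBlock(blockSize, 1, 1);

    // Parameters are passed as an array of pointers to each argument.
    void *kernelArgs[] = {(void *)&d_data, (void *)&n};
    cudaLaunchCooperativeKernel((void *)myKernel, dimGrid, dimBlock, kernelArgs, 0, stream);
}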
@@ -34,8 +34,8 @@
 #endif

 // Includes, system
-#include <stdio.h>
 #include <cassert>
+#include <stdio.h>

 // Includes CUDA
 #include <cuda_runtime.h>
@@ -58,7 +58,8 @@ bool testResult = true;
 //! Tests assert function.
 //! Thread whose id > N will print assertion failed error message.
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void testKernel(int N) {
+__global__ void testKernel(int N)
+{
     int gtid = blockIdx.x * blockDim.x + threadIdx.x;
     assert(gtid < N);
 }
@@ -70,17 +71,18 @@ void runTest(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     int Nblocks = 2;
     int Nthreads = 32;
     cudaError_t error;
@@ -94,7 +96,8 @@ void runTest(int argc, char **argv) {
     if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
         printf("simpleAssert is not current supported on Mac OSX\n\n");
         exit(EXIT_SUCCESS);
-    } else {
+    }
+    else {
         printf("OS Info: <%s>\n\n", OS_System_Type.version);
     }

@@ -118,8 +121,7 @@ void runTest(int argc, char **argv) {

     // Check for errors and failed asserts in asynchronous kernel launch.
     if (error == cudaErrorAssert) {
-        printf(
-            "Device assert failed as expected, "
+        printf("Device assert failed as expected, "
                "CUDA error message is: %s\n\n",
                cudaGetErrorString(error));
     }
@@ -34,11 +34,12 @@
 #endif

 // Includes, system
-#include <stdio.h>
 #include <cassert>
+#include <stdio.h>

 // Includes CUDA
 #include <cuda_runtime.h>

 #include "nvrtc_helper.h"

 // Utilities and timing functions
@@ -58,7 +59,8 @@ void runTest(int argc, char **argv);
 // Program main
 ////////////////////////////////////////////////////////////////////////////////

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);
@@ -66,7 +68,8 @@ int main(int argc, char **argv) {
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     int Nblocks = 2;
     int Nthreads = 32;

@@ -91,10 +94,15 @@ void runTest(int argc, char **argv) {
     int count = 60;
     void *args[] = {(void *)&count};

-    checkCudaErrors(cuLaunchKernel(
-        kernel_addr, dimGrid.x, dimGrid.y, dimGrid.z, /* grid dim */
-        dimBlock.x, dimBlock.y, dimBlock.z, /* block dim */
-        0, 0, /* shared mem, stream */
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   dimGrid.x,
+                                   dimGrid.y,
+                                   dimGrid.z, /* grid dim */
+                                   dimBlock.x,
+                                   dimBlock.y,
+                                   dimBlock.z, /* block dim */
+                                   0,
+                                   0, /* shared mem, stream */
                                    &args[0], /* arguments */
                                    0));
@@ -32,7 +32,8 @@
 //! Thread whose id > N will print assertion failed error message.
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" __global__ void testKernel(int N) {
+extern "C" __global__ void testKernel(int N)
+{
     int gtid = blockIdx.x * blockDim.x + threadIdx.x;
     assert(gtid < N);
 }
@@ -30,10 +30,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@@ -68,20 +68,21 @@ extern "C" bool computeGold(int *gpuData, const int len);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     cudaStream_t stream;
     // This will pick the best possible CUDA capable device
     findCudaDevice(argc, (const char **)argv);
@@ -100,7 +101,8 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaMallocHost(&hOData, memSize));

     // initialize the memory
-    for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
+    for (unsigned int i = 0; i < numData; i++)
+        hOData[i] = 0;

     // To make the AND and XOR tests generate something other than 0...
     hOData[8] = hOData[10] = 0xff;
@@ -110,15 +112,13 @@ void runTest(int argc, char **argv) {
     int *dOData;
     checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
     // copy host memory to device to initialize to zero
-    checkCudaErrors(
-        cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

     // execute the kernel
     testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

     // Copy result from device to host
-    checkCudaErrors(
-        cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaStreamSynchronize(stream));

     sdkStopTimer(&timer);
@@ -42,7 +42,8 @@ extern "C" int computeGold(int *gpuData, const int len);
 //! @param idata input data as provided to device
 //! @param len number of elements in reference / idata
 ////////////////////////////////////////////////////////////////////////////////
-int computeGold(int *gpuData, const int len) {
+int computeGold(int *gpuData, const int len)
+{
     int val = 0;

     for (int i = 0; i < len; ++i) {
@@ -35,7 +35,8 @@
 //! @param g_idata input data in global memory
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void testKernel(int *g_odata) {
+__global__ void testKernel(int *g_odata)
+{
     // access thread id
     const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -30,10 +30,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@@ -64,13 +64,13 @@ extern "C" bool computeGold(int *gpuData, const int len);
 // Program main
 ////////////////////////////////////////////////////////////////////////////////

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");

     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
@@ -79,7 +79,8 @@ int main(int argc, char **argv) {
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////

-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     int dev = 0;

     char *cubin, *kernel_file;
@@ -106,7 +107,8 @@ void runTest(int argc, char **argv) {
     int *hOData = (int *)malloc(memSize);

     // initialize the memory
-    for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
+    for (unsigned int i = 0; i < numData; i++)
+        hOData[i] = 0;

     // To make the AND and XOR tests generate something other than 0...
     hOData[8] = hOData[10] = 0xff;
@@ -121,11 +123,15 @@ void runTest(int argc, char **argv) {
     dim3 cudaGridSize(numBlocks, 1, 1);

     void *arr[] = {(void *)&dOData};
-    checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
                                    cudaGridSize.z, /* grid dim */
-                                   cudaBlockSize.x, cudaBlockSize.y,
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
                                    cudaBlockSize.z, /* block dim */
-                                   0, 0, /* shared mem, stream */
+                                   0,
+                                   0, /* shared mem, stream */
                                    &arr[0], /* arguments */
                                    0));
@@ -43,7 +43,8 @@ extern "C" int computeGold(int *gpuData, const int len);
 //! @param len number of elements in reference / idata
 ////////////////////////////////////////////////////////////////////////////////

-int computeGold(int *gpuData, const int len) {
+int computeGold(int *gpuData, const int len)
+{
     int val = 0;

     for (int i = 0; i < len; ++i) {
@@ -36,7 +36,8 @@
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" __global__ void testKernel(int *g_odata) {
+extern "C" __global__ void testKernel(int *g_odata)
+{
     // access thread id
     const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -26,10 +26,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes CUDA
 #include <cuda_runtime.h>
@@ -42,7 +42,8 @@
 // declaration, forward
 void runTest(int argc, char **argv);

-cudaAccessPolicyWindow initAccessPolicyWindow(void) {
+cudaAccessPolicyWindow initAccessPolicyWindow(void)
+{
     cudaAccessPolicyWindow accessPolicyWindow = {0};
     accessPolicyWindow.base_ptr = (void *)0;
     accessPolicyWindow.num_bytes = 0;
@@ -60,8 +61,8 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) {
 //! @param bigDataSize input bigData size
 //! @param hitcount how many data access are done within block
 ////////////////////////////////////////////////////////////////////////////////
-static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
-                                            int bigDataSize, int hitCount) {
+static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
+{
     __shared__ unsigned int hit;
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
@@ -82,9 +83,9 @@ static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,

     if ((tID % 2) == 0) {
         data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
-    } else {
-        trash[psRand % bigDataSize] =
-            trash[psRand % bigDataSize] + trash[idx % bigDataSize];
+    }
+    else {
+        trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
     }

     atomicAdd(&hit, 1);
@@ -98,7 +99,8 @@ int main(int argc, char **argv) { runTest(argc, argv); }
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     bool bTestResult = true;
     cudaAccessPolicyWindow accessPolicyWindow;
     cudaDeviceProp deviceProp;
@@ -127,8 +129,7 @@ void runTest(int argc, char **argv) {

     // Make sure device the l2 optimization
     if (deviceProp.persistingL2CacheMaxSize == 0) {
-        printf(
-            "Waiving execution as device %d does not support persisting L2 "
+        printf("Waiving execution as device %d does not support persisting L2 "
                "Caching\n",
                devID);
         exit(EXIT_WAIVED);
@@ -139,8 +140,7 @@ void runTest(int argc, char **argv) {

     // Set the amount of l2 cache that will be persisting to maximum the device
     // can support
-    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
-                                       deviceProp.persistingL2CacheMaxSize));
+    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

     // Stream attribute to set
     streamAttrID = cudaStreamAttributeAccessPolicyWindow;
@@ -155,8 +155,7 @@ void runTest(int argc, char **argv) {

     // Allocate data
     checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
-    checkCudaErrors(
-        cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));
+    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

     for (int i = 0; i < bigDataSize; ++i) {
         if (i < dataSize) {
@@ -166,16 +165,12 @@ void runTest(int argc, char **argv) {
         bigDataHostPointer[bigDataSize - i - 1] = i;
     }

-    checkCudaErrors(
-        cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
-    checkCudaErrors(
-        cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
-    checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
-                                    dataSize * sizeof(int),
-                                    cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
-                                    bigDataSize * sizeof(int),
-                                    cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
+    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
+    checkCudaErrors(
+        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(
+        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

     // Make a window for the buffer of interest
     accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
@@ -186,8 +181,7 @@ void runTest(int argc, char **argv) {
     streamAttrValue.accessPolicyWindow = accessPolicyWindow;

     // Assign window to stream
-    checkCudaErrors(
-        cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
+    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

     // Demote any previous persisting lines
     checkCudaErrors(cudaCtxResetPersistingL2Cache());
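The simpleAttributes hunks above all orbit one recipe: reserve persisting L2, describe an access-policy window over the hot buffer, attach it to a stream, and reset the cache between experiments. A compact sketch of that recipe follows; the function name (setPersistingWindow) and the arguments (ptr, bytes) are placeholders for illustration, not code from this diff.

#include <cuda_runtime.h>

// Sketch only: pin `bytes` of `ptr` into persisting L2 for work on `stream`.
void setPersistingWindow(cudaStream_t stream, void *ptr, size_t bytes, int device)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    // Reserve as much persisting L2 as the device allows.
    cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, prop.persistingL2CacheMaxSize);

    cudaStreamAttrValue attr = {};
    attr.accessPolicyWindow.base_ptr  = ptr;
    attr.accessPolicyWindow.num_bytes = bytes; // should not exceed prop.accessPolicyMaxWindowSize
    attr.accessPolicyWindow.hitRatio  = 1.0f;  // fraction of the window treated as persisting
    attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);

    // Evict anything previously pinned so the new window starts clean.
    cudaCtxResetPersistingL2Cache();
}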
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
@@ -50,8 +50,8 @@
 #endif

 // CUDA includes
-#include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
+#include <cuda_runtime.h>

 // CUDA utilities and system includes
 #include <helper_cuda.h>
@@ -124,8 +124,7 @@ StopWatchInterface *timer = NULL;
 GLuint shDraw;

 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
-                                   unsigned int *g_odata, int imgw);
+extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw);

 // Forward declarations
 void runStdProgram(int argc, char **argv);
@@ -140,8 +139,7 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource);
 void deletePBO(GLuint *pbo);
 #endif

-void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
-                      unsigned int size_y);
+void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y);
 void deleteTexture(GLuint *tex);

 // rendering callbacks
@@ -155,7 +153,8 @@ void mainMenu(int i);
 ////////////////////////////////////////////////////////////////////////////////
 //! Create PBO
 ////////////////////////////////////////////////////////////////////////////////
-void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) {
+void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource)
+{
     // set up vertex data parameter
     num_texels = image_width * image_height;
     num_values = num_texels * 4;
@@ -171,33 +170,32 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) {
     glBindBuffer(GL_ARRAY_BUFFER, 0);

     // register this buffer object with CUDA
-    checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo,
-                                                 cudaGraphicsMapFlagsNone));
+    checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, cudaGraphicsMapFlagsNone));

     SDK_CHECK_ERROR_GL();
 }

-void deletePBO(GLuint *pbo) {
+void deletePBO(GLuint *pbo)
+{
     glDeleteBuffers(1, pbo);
     SDK_CHECK_ERROR_GL();
     *pbo = 0;
 }
 #endif

-const GLenum fbo_targets[] = {
-    GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT,
-    GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT};
+const GLenum fbo_targets[] = {GL_COLOR_ATTACHMENT0_EXT,
+                              GL_COLOR_ATTACHMENT1_EXT,
+                              GL_COLOR_ATTACHMENT2_EXT,
+                              GL_COLOR_ATTACHMENT3_EXT};

 #ifndef USE_TEXSUBIMAGE2D
-static const char *glsl_drawtex_vertshader_src =
-    "void main(void)\n"
+static const char *glsl_drawtex_vertshader_src = "void main(void)\n"
                                                  "{\n"
                                                  "  gl_Position = gl_Vertex;\n"
                                                  "  gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n"
                                                  "}\n";

-static const char *glsl_drawtex_fragshader_src =
-    "#version 130\n"
+static const char *glsl_drawtex_fragshader_src = "#version 130\n"
                                                  "uniform usampler2D texImage;\n"
                                                  "void main()\n"
                                                  "{\n"
@@ -227,15 +225,15 @@ static const char *glsl_draw_fragshader_src =
 #endif

 // copy image and process using CUDA
-void generateCUDAImage() {
+void generateCUDAImage()
+{
     // run the Cuda kernel
     unsigned int *out_data;

 #ifdef USE_TEXSUBIMAGE2D
     checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0));
     size_t num_bytes;
-    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
-        (void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
+    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
     // printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n",
     // num_bytes, size_tex_data);
 #else
@@ -258,8 +256,7 @@ void generateCUDAImage() {
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest);

     glBindTexture(GL_TEXTURE_2D, tex_cudaResult);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA,
-                    GL_UNSIGNED_BYTE, NULL);
+    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
     SDK_CHECK_ERROR_GL();
     glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
@@ -268,21 +265,20 @@ void generateCUDAImage() {
     // map buffer objects to get CUDA device pointers
     cudaArray *texture_ptr;
     checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0));
-    checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(
-        &texture_ptr, cuda_tex_result_resource, 0, 0));
+    checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&texture_ptr, cuda_tex_result_resource, 0, 0));

     int num_texels = image_width * image_height;
     int num_values = num_texels * 4;
     int size_tex_data = sizeof(GLubyte) * num_values;
-    checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource,
-                                      size_tex_data, cudaMemcpyDeviceToDevice));
+    checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice));

     checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0));
 #endif
 }

 // display image to the screen as textured quad
-void displayImage(GLuint texture) {
+void displayImage(GLuint texture)
+{
     glBindTexture(GL_TEXTURE_2D, texture);
     glEnable(GL_TEXTURE_2D);
     glDisable(GL_DEPTH_TEST);
@@ -332,7 +328,8 @@ void displayImage(GLuint texture) {
 ////////////////////////////////////////////////////////////////////////////////
 //! Display callback
 ////////////////////////////////////////////////////////////////////////////////
-void display() {
+void display()
+{
     sdkStartTimer(&timer);

     if (enable_cuda) {
@@ -358,9 +355,7 @@ void display() {
         sprintf(currentOutputPPM, "kilt.ppm");
         g_CheckRender->savePPM(currentOutputPPM, true, NULL);

-        if (!g_CheckRender->PPMvsPPM(currentOutputPPM,
-                                     sdkFindFilePath(ref_file, pArgv[0]),
-                                     MAX_EPSILON, 0.30f)) {
+        if (!g_CheckRender->PPMvsPPM(currentOutputPPM, sdkFindFilePath(ref_file, pArgv[0]), MAX_EPSILON, 0.30f)) {
            g_TotalErrors++;
        }

@@ -374,8 +369,7 @@ void display() {
     if (++fpsCount == fpsLimit) {
         char cTitle[256];
         float fps = 1000.0f / sdkGetAverageTimerValue(&timer);
-        sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width,
-                window_height, fps);
+        sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, window_height, fps);
         glutSetWindowTitle(cTitle);
         // printf("%s\n", cTitle);
         fpsCount = 0;
@@ -384,7 +378,8 @@ void display() {
     }
 }

-void timerEvent(int value) {
+void timerEvent(int value)
+{
     glutPostRedisplay();
     glutTimerFunc(REFRESH_DELAY, timerEvent, 0);
 }
@@ -392,7 +387,8 @@ void timerEvent(int value) {
 ////////////////////////////////////////////////////////////////////////////////
 //! Keyboard events handler
 ////////////////////////////////////////////////////////////////////////////////
-void keyboard(unsigned char key, int /*x*/, int /*y*/) {
+void keyboard(unsigned char key, int /*x*/, int /*y*/)
+{
     switch (key) {
     case (27):
         Cleanup(EXIT_SUCCESS);
@@ -404,7 +400,8 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) {

         if (enable_cuda) {
             glClearColorIuiEXT(128, 128, 128, 255);
-        } else {
+        }
+        else {
             glClearColor(0.5, 0.5, 0.5, 1.0);
         }

@@ -413,7 +410,8 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) {
     }
 }

-void reshape(int w, int h) {
+void reshape(int w, int h)
+{
     window_width = w;
     window_height = h;
 }
@@ -423,8 +421,8 @@ void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); }
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
-                      unsigned int size_y) {
+void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y)
+{
     // create a texture
     glGenTextures(1, tex_cudaResult);
     glBindTexture(GL_TEXTURE_2D, *tex_cudaResult);
@@ -436,24 +434,22 @@ void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

 #ifdef USE_TEXSUBIMAGE2D
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA,
-                 GL_UNSIGNED_BYTE, NULL);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
     SDK_CHECK_ERROR_GL();
 #else
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0,
-                 GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
     SDK_CHECK_ERROR_GL();
     // register this texture with CUDA
     checkCudaErrors(cudaGraphicsGLRegisterImage(
-        &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D,
-        cudaGraphicsMapFlagsWriteDiscard));
+        &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard));
 #endif
 }

 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void deleteTexture(GLuint *tex) {
+void deleteTexture(GLuint *tex)
+{
     glDeleteTextures(1, tex);
     SDK_CHECK_ERROR_GL();
@@ -463,7 +459,8 @@ void deleteTexture(GLuint *tex) {
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 #if defined(__linux__)
     char *Xstatus = getenv("DISPLAY");
     if (Xstatus == NULL) {
@@ -487,8 +484,7 @@ int main(int argc, char **argv) {
     if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
         printf("[%s]\n", argv[0]);
         printf(" Does not explicitly support -device=n\n");
-        printf(
-            " This sample requires OpenGL. Only -file=<reference> are "
+        printf(" This sample requires OpenGL. Only -file=<reference> are "
                "supported\n");
         printf("exiting...\n");
         exit(EXIT_WAIVED);
@@ -497,7 +493,8 @@ int main(int argc, char **argv) {
     if (ref_file) {
         printf("(Test with OpenGL verification)\n");
         runStdProgram(argc, argv);
-    } else {
+    }
+    else {
         printf("(Interactive OpenGL Demo)\n");
         runStdProgram(argc, argv);
     }
@@ -508,7 +505,8 @@ int main(int argc, char **argv) {
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void FreeResource() {
+void FreeResource()
+{
     sdkDeleteTimer(&timer);

     // unregister this buffer object with CUDA
@@ -530,18 +528,18 @@ void FreeResource() {
     printf("simpleCUDA2GL Exiting...\n");
 }

-void Cleanup(int iExitCode) {
+void Cleanup(int iExitCode)
+{
     FreeResource();
-    printf("PPM Images are %s\n",
-           (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
+    printf("PPM Images are %s\n", (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
     exit(iExitCode);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-GLuint compileGLSLprogram(const char *vertex_shader_src,
-                          const char *fragment_shader_src) {
+GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_shader_src)
+{
     GLuint v, f, p = 0;

     p = glCreateProgram();
@@ -563,7 +561,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
             // #endif
             glDeleteShader(v);
             return 0;
-        } else {
+        }
+        else {
             glAttachShader(p, v);
         }
     }
@@ -585,7 +584,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
             // #endif
             glDeleteShader(f);
             return 0;
-        } else {
+        }
+        else {
             glAttachShader(p, f);
         }
     }
@@ -611,7 +611,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
 //! Allocate the "render target" of CUDA
 ////////////////////////////////////////////////////////////////////////////////
 #ifndef USE_TEXSUBIMAGE2D
-void initCUDABuffers() {
+void initCUDABuffers()
+{
     // set up vertex data parameter
     num_texels = image_width * image_height;
     num_values = num_texels * 4;
@ -625,7 +626,8 @@ void initCUDABuffers() {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//!
|
//!
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
void initGLBuffers() {
|
void initGLBuffers()
|
||||||
|
{
|
||||||
// create pbo
|
// create pbo
|
||||||
#ifdef USE_TEXSUBIMAGE2D
|
#ifdef USE_TEXSUBIMAGE2D
|
||||||
createPBO(&pbo_dest, &cuda_pbo_dest_resource);
|
createPBO(&pbo_dest, &cuda_pbo_dest_resource);
|
||||||
@ -636,8 +638,7 @@ void initGLBuffers() {
|
|||||||
shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src);
|
shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src);
|
||||||
|
|
||||||
#ifndef USE_TEXSUBIMAGE2D
|
#ifndef USE_TEXSUBIMAGE2D
|
||||||
shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src,
|
shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src);
|
||||||
glsl_drawtex_fragshader_src);
|
|
||||||
#endif
|
#endif
|
||||||
SDK_CHECK_ERROR_GL();
|
SDK_CHECK_ERROR_GL();
|
||||||
}
|
}
|
||||||
@ -645,7 +646,8 @@ void initGLBuffers() {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//! Run standard demo loop with or without GL verification
|
//! Run standard demo loop with or without GL verification
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
void runStdProgram(int argc, char **argv) {
|
void runStdProgram(int argc, char **argv)
|
||||||
|
{
|
||||||
// First initialize OpenGL context, so we can properly set the GL for CUDA.
|
// First initialize OpenGL context, so we can properly set the GL for CUDA.
|
||||||
// This is necessary in order to achieve optimal performance with OpenGL/CUDA
|
// This is necessary in order to achieve optimal performance with OpenGL/CUDA
|
||||||
// interop.
|
// interop.
|
||||||
@ -683,8 +685,7 @@ void runStdProgram(int argc, char **argv) {
|
|||||||
g_CheckRender->EnableQAReadback(true);
|
g_CheckRender->EnableQAReadback(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf(
|
printf("\n"
|
||||||
"\n"
|
|
||||||
"\tControls\n"
|
"\tControls\n"
|
||||||
"\t(right click mouse button for Menu)\n"
|
"\t(right click mouse button for Menu)\n"
|
||||||
"\t[esc] - Quit\n\n");
|
"\t[esc] - Quit\n\n");
|
||||||
@ -699,7 +700,8 @@ void runStdProgram(int argc, char **argv) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//! Initialize GL
|
//! Initialize GL
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool initGL(int *argc, char **argv) {
|
bool initGL(int *argc, char **argv)
|
||||||
|
{
|
||||||
// Create GL context
|
// Create GL context
|
||||||
glutInit(argc, argv);
|
glutInit(argc, argv);
|
||||||
glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH);
|
glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH);
|
||||||
@ -707,8 +709,8 @@ bool initGL(int *argc, char **argv) {
|
|||||||
iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing");
|
iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing");
|
||||||
|
|
||||||
// initialize necessary OpenGL extensions
|
// initialize necessary OpenGL extensions
|
||||||
if (!isGLVersionSupported(2, 0) ||
|
if (!isGLVersionSupported(2, 0)
|
||||||
!areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
|
|| !areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
|
||||||
"GL_EXT_framebuffer_object")) {
|
"GL_EXT_framebuffer_object")) {
|
||||||
printf("ERROR: Support for necessary OpenGL extensions missing.");
|
printf("ERROR: Support for necessary OpenGL extensions missing.");
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
@ -729,8 +731,7 @@ bool initGL(int *argc, char **argv) {
|
|||||||
// projection
|
// projection
|
||||||
glMatrixMode(GL_PROJECTION);
|
glMatrixMode(GL_PROJECTION);
|
||||||
glLoadIdentity();
|
glLoadIdentity();
|
||||||
gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f,
|
gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, 10.0f);
|
||||||
10.0f);
|
|
||||||
|
|
||||||
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
|
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
|
||||||
|
|
||||||
|
|||||||
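
Context for the `initGLBuffers()` hunks: the sample drives CUDA output through a GL pixel buffer object. A minimal sketch of that interop pattern, assuming a valid GL context; the names `pbo`, `registerPBO`, and `mapPBO` are illustrative and error checking is elided:

```cpp
// Sketch: register a GL pixel buffer object with CUDA, then map it to obtain
// a device pointer a kernel can write into.
#include <cuda_gl_interop.h>

cudaGraphicsResource *cuda_pbo_resource = nullptr;

void registerPBO(GLuint pbo)
{
    // One-time registration of the GL buffer with CUDA
    cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard);
}

unsigned int *mapPBO(size_t *num_bytes)
{
    // Per-frame: map the resource and fetch the device pointer
    unsigned int *d_ptr = nullptr;
    cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
    cudaGraphicsResourceGetMappedPointer((void **)&d_ptr, num_bytes, cuda_pbo_resource);
    return d_ptr; // launch kernels on d_ptr, then cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
}
```
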

@@ -35,14 +35,16 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
 __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

 // convert floating point rgb color to 8-bit integer
-__device__ int rgbToInt(float r, float g, float b) {
+__device__ int rgbToInt(float r, float g, float b)
+{
 r = clamp(r, 0.0f, 255.0f);
 g = clamp(g, 0.0f, 255.0f);
 b = clamp(b, 0.0f, 255.0f);
 return (int(b) << 16) | (int(g) << 8) | int(r);
 }

-__global__ void cudaProcess(unsigned int *g_odata, int imgw) {
+__global__ void cudaProcess(unsigned int *g_odata, int imgw)
+{
 extern __shared__ uchar4 sdata[];

 int tx = threadIdx.x;

@@ -56,7 +58,7 @@ __global__ void cudaProcess(unsigned int *g_odata, int imgw) {
 g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
 }

-extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
-unsigned int *g_odata, int imgw) {
+extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
+{
 cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
 }
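
`launch_cudaProcess()` forwards `sbytes` as the third launch parameter, which sizes the kernel's `extern __shared__` array at launch time. A self-contained sketch of the same mechanism (kernel and variable names hypothetical):

```cpp
// Sketch: the third <<<>>> parameter backs the extern __shared__ array.
__global__ void staged(const int *in, int *out, int n)
{
    extern __shared__ int tile[]; // sized by the dynamic shared-memory argument
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = (i < n) ? in[i] : 0; // stage input through shared memory
    __syncthreads();                         // every thread reaches the barrier
    if (i < n)
        out[i] = tile[threadIdx.x] + 1;
}

// Usage: one int of shared memory per thread in the block.
// staged<<<grid, block, block.x * sizeof(int)>>>(d_in, d_out, n);
```
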

@@ -29,18 +29,21 @@

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
+{
 return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
 }

 // Wait for thread to finish
-void cutEndThread(CUTThread thread) {
+void cutEndThread(CUTThread thread)
+{
 WaitForSingleObject(thread, INFINITE);
 CloseHandle(thread);
 }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num) {
+void cutWaitForThreads(const CUTThread *threads, int num)
+{
 WaitForMultipleObjects(num, threads, true, INFINITE);

 for (int i = 0; i < num; i++) {

@@ -49,7 +52,8 @@ void cutWaitForThreads(const CUTThread *threads, int num) {
 }

 // Create barrier.
-CUTBarrier cutCreateBarrier(int releaseCount) {
+CUTBarrier cutCreateBarrier(int releaseCount)
+{
 CUTBarrier barrier;

 InitializeCriticalSection(&barrier.criticalSection);

@@ -61,7 +65,8 @@ CUTBarrier cutCreateBarrier(int releaseCount) {
 }

 // Increment barrier. (execution continues)
-void cutIncrementBarrier(CUTBarrier *barrier) {
+void cutIncrementBarrier(CUTBarrier *barrier)
+{
 int myBarrierCount;
 EnterCriticalSection(&barrier->criticalSection);
 myBarrierCount = ++barrier->count;

@@ -73,16 +78,15 @@ void cutIncrementBarrier(CUTBarrier *barrier) {
 }

 // Wait for barrier release.
-void cutWaitForBarrier(CUTBarrier *barrier) {
-WaitForSingleObject(barrier->barrierEvent, INFINITE);
-}
+void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

 // Destroy barrier
 void cutDestroyBarrier(CUTBarrier *barrier) {}

 #else
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
+{
 pthread_t thread;
 pthread_create(&thread, NULL, func, data);
 return thread;

@@ -92,14 +96,16 @@ CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
 void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num) {
+void cutWaitForThreads(const CUTThread *threads, int num)
+{
 for (int i = 0; i < num; i++) {
 cutEndThread(threads[i]);
 }
 }

 // Create barrier.
-CUTBarrier cutCreateBarrier(int releaseCount) {
+CUTBarrier cutCreateBarrier(int releaseCount)
+{
 CUTBarrier barrier;

 barrier.count = 0;

@@ -112,7 +118,8 @@ CUTBarrier cutCreateBarrier(int releaseCount) {
 }

 // Increment barrier. (execution continues)
-void cutIncrementBarrier(CUTBarrier *barrier) {
+void cutIncrementBarrier(CUTBarrier *barrier)
+{
 int myBarrierCount;
 pthread_mutex_lock(&barrier->mutex);
 myBarrierCount = ++barrier->count;

@@ -124,7 +131,8 @@ void cutIncrementBarrier(CUTBarrier *barrier) {
 }

 // Wait for barrier release.
-void cutWaitForBarrier(CUTBarrier *barrier) {
+void cutWaitForBarrier(CUTBarrier *barrier)
+{
 pthread_mutex_lock(&barrier->mutex);

 while (barrier->count < barrier->releaseCount) {

@@ -135,7 +143,8 @@ void cutWaitForBarrier(CUTBarrier *barrier) {
 }

 // Destroy barrier
-void cutDestroyBarrier(CUTBarrier *barrier) {
+void cutDestroyBarrier(CUTBarrier *barrier)
+{
 pthread_mutex_destroy(&barrier->mutex);
 pthread_cond_destroy(&barrier->conditionVariable);
 }
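
The implementation above is a counting barrier: each worker checks in once via `cutIncrementBarrier()`, and `cutWaitForBarrier()` blocks until `releaseCount` check-ins have arrived. A usage sketch against this same API (the worker body and `runWorkers` wrapper are illustrative):

```cpp
// Sketch: n workers each signal the barrier once; the caller waits for all n.
#include "multithreading.h"

static CUTBarrier barrier;

static CUT_THREADPROC worker(void *arg)
{
    // ... per-thread work ...
    cutIncrementBarrier(&barrier); // check in; this thread keeps running
    CUT_THREADEND;
}

void runWorkers(int n)
{
    barrier = cutCreateBarrier(n); // released after n check-ins
    for (int i = 0; i < n; ++i)
        cutStartThread(worker, NULL);
    cutWaitForBarrier(&barrier);
    cutDestroyBarrier(&barrier);
}
```
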

@@ -37,7 +37,8 @@
 typedef HANDLE CUTThread;
 typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

-struct CUTBarrier {
+struct CUTBarrier
+{
 CRITICAL_SECTION criticalSection;
 HANDLE barrierEvent;
 int releaseCount;

@@ -57,7 +58,8 @@ typedef void *(*CUT_THREADROUTINE)(void *);
 #define CUT_THREADPROC void *
 #define CUT_THREADEND return 0

-struct CUTBarrier {
+struct CUTBarrier
+{
 pthread_mutex_t mutex;
 pthread_cond_t conditionVariable;
 int releaseCount;

@@ -67,7 +69,8 @@ struct CUTBarrier {
 #endif

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 // Create thread.

@@ -43,8 +43,8 @@
 #include <stdio.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #include "multithreading.h"

@@ -53,10 +53,10 @@ const int N_elements_per_workload = 100000;

 CUTBarrier thread_barrier;

-void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status,
-void *data);
+void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);

-struct heterogeneous_workload {
+struct heterogeneous_workload
+{
 int id;
 int cudaDeviceID;

@@ -67,13 +67,16 @@ struct heterogeneous_workload {
 bool success;
 };

-__global__ void incKernel(int *data, int N) {
+__global__ void incKernel(int *data, int N)
+{
 int i = blockIdx.x * blockDim.x + threadIdx.x;

-if (i < N) data[i]++;
+if (i < N)
+data[i]++;
 }

-CUT_THREADPROC launch(void *void_arg) {
+CUT_THREADPROC launch(void *void_arg)
+{
 heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

 // Select GPU for this CPU thread

@@ -81,11 +84,8 @@ CUT_THREADPROC launch(void *void_arg) {

 // Allocate Resources
 checkCudaErrors(cudaStreamCreate(&workload->stream));
-checkCudaErrors(
-cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
-checkCudaErrors(cudaHostAlloc(&workload->h_data,
-N_elements_per_workload * sizeof(int),
-cudaHostAllocPortable));
+checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
+checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));

 // CPU thread generates data
 for (int i = 0; i < N_elements_per_workload; ++i) {

@@ -97,25 +97,28 @@ CUT_THREADPROC launch(void *void_arg) {
 dim3 block(512);
 dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

-checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data,
+checkCudaErrors(cudaMemcpyAsync(workload->d_data,
+workload->h_data,
 N_elements_per_workload * sizeof(int),
-cudaMemcpyHostToDevice, workload->stream));
-incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data,
-N_elements_per_workload);
-checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data,
+cudaMemcpyHostToDevice,
+workload->stream));
+incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
+checkCudaErrors(cudaMemcpyAsync(workload->h_data,
+workload->d_data,
 N_elements_per_workload * sizeof(int),
-cudaMemcpyDeviceToHost, workload->stream));
+cudaMemcpyDeviceToHost,
+workload->stream));

 // New in CUDA 5.0: Add a CPU callback which is called once all currently
 // pending operations in the CUDA stream have finished
-checkCudaErrors(
-cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
+checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

 CUT_THREADEND;
 // CPU thread end of life, GPU continues to process data...
 }

-CUT_THREADPROC postprocess(void *void_arg) {
+CUT_THREADPROC postprocess(void *void_arg)
+{
 heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
 // ... GPU is done with processing, continue on new CPU thread...

@@ -140,8 +143,8 @@ CUT_THREADPROC postprocess(void *void_arg) {
 CUT_THREADEND;
 }

-void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
-void *data) {
+void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
+{
 // Check status of GPU after stream operations are done
 checkCudaErrors(status);

@@ -149,7 +152,8 @@ void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
 cutStartThread(postprocess, data);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 int N_gpus, max_gpus = 0;
 int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

@@ -168,10 +172,8 @@ int main(int argc, char **argv) {
 cudaSetDevice(devid);
 cudaGetDeviceProperties(&deviceProp, devid);
 SMversion = deviceProp.major << 4 + deviceProp.minor;
-printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name,
-deviceProp.major, deviceProp.minor);
-printf(", %s GPU Callback Functions\n",
-(SMversion >= 0x11) ? "capable" : "NOT capable");
+printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
+printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

 if (SMversion >= 0x11) {
 gpuInfo[max_gpus++] = devid;

@@ -181,8 +183,7 @@ int main(int argc, char **argv) {
 printf("%d GPUs available to run Callback Functions\n", max_gpus);

 heterogeneous_workload *workloads;
-workloads = (heterogeneous_workload *)malloc(N_workloads *
-sizeof(heterogeneous_workload));
+workloads = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
 ;
 thread_barrier = cutCreateBarrier(N_workloads);
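
The sample's core mechanism is `cudaStreamAddCallback()`: the host function fires on a CUDA-managed thread once all prior work in the stream completes, and it must not call into the CUDA API. A minimal sketch of the pattern (function names hypothetical):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Runs on a CUDA-managed thread after all prior work in `stream` finishes.
// CUDA API calls are not allowed inside a stream callback.
static void CUDART_CB onStreamDone(cudaStream_t stream, cudaError_t status, void *userData)
{
    printf("workload %d done, status %d\n", *(int *)userData, (int)status);
}

void enqueue(cudaStream_t stream, int *workloadId)
{
    // ... cudaMemcpyAsync() / kernel launches on `stream` ...
    cudaStreamAddCallback(stream, onStreamDone, workloadId, 0); // flags must be 0
}
```
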

@@ -38,8 +38,8 @@
 *
 */

-#include <stdio.h>
 #include <cooperative_groups.h>
+#include <stdio.h>

 using namespace cooperative_groups;

@@ -49,7 +49,8 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
-__device__ int sumReduction(thread_group g, int *x, int val) {
+__device__ int sumReduction(thread_group g, int *x, int val)
+{
 // rank of this thread in the group
 int lane = g.thread_rank();

@@ -85,7 +86,8 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
 *
 * Creates cooperative groups and performs reductions
 */
-__global__ void cgkernel() {
+__global__ void cgkernel()
+{
 // threadBlockGroup includes all threads in the block
 thread_block threadBlockGroup = this_thread_block();
 int threadBlockGroupSize = threadBlockGroup.size();

@@ -107,24 +109,22 @@ __global__ void cgkernel() {

 // master thread in group prints out result
 if (threadBlockGroup.thread_rank() == 0) {
-printf(
-" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
-(int)threadBlockGroup.size() - 1, output, expectedOutput);
+printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
+(int)threadBlockGroup.size() - 1,
+output,
+expectedOutput);

-printf(" Now creating %d groups, each of size 16 threads:\n\n",
-(int)threadBlockGroup.size() / 16);
+printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
 }

 threadBlockGroup.sync();

 // each tiledPartition16 group includes 16 threads
-thread_block_tile<16> tiledPartition16 =
-tiled_partition<16>(threadBlockGroup);
+thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

 // This offset allows each group to have its own unique area in the workspace
 // array
-int workspaceOffset =
-threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
+int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();

 // input to reduction, for each thread, is its' rank in the group
 input = tiledPartition16.thread_rank();

@@ -138,10 +138,10 @@ __global__ void cgkernel() {

 // each master thread prints out result
 if (tiledPartition16.thread_rank() == 0)
-printf(
-" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
+printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
 "(expected %d)\n",
-output, expectedOutput);
+output,
+expectedOutput);

 return;
 }

@@ -149,7 +149,8 @@ __global__ void cgkernel() {
 /**
 * Host main routine
 */
-int main() {
+int main()
+{
 // Error code to check return values for CUDA calls
 cudaError_t err;

@@ -166,8 +167,7 @@ int main() {
 err = cudaDeviceSynchronize();

 if (err != cudaSuccess) {
-fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
-cudaGetErrorString(err));
+fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
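
As a companion to the shared-memory `sumReduction()` above, a tile can also reduce through the cooperative-groups shuffle interface with no workspace array at all. A sketch under the same header (kernel and helper names hypothetical):

```cpp
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Reduce within a 16-thread tile using shuffles instead of a shared workspace.
__device__ int tileSum(cg::thread_block_tile<16> tile, int val)
{
    for (int offset = tile.size() / 2; offset > 0; offset /= 2)
        val += tile.shfl_down(val, offset);
    return val; // complete sum held by tile.thread_rank() == 0
}

__global__ void tileReduceDemo(int *out)
{
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);
    int sum = tileSum(tile, (int)tile.thread_rank()); // expect 0+1+...+15 = 120
    if (tile.thread_rank() == 0)
        out[(block.thread_rank() - tile.thread_rank()) / 16] = sum; // one result per tile
}
```
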

@@ -36,17 +36,17 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes CUDA
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 static const char *sSDKname = "simpleCubemapTexture";

@@ -56,8 +56,8 @@ static const char *sSDKname = "simpleCubemapTexture";
 //! Transform a cubemap face of a linear buffe using cubemap texture lookups
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width,
-cudaTextureObject_t tex) {
+__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
+{
 // calculate this thread's data point
 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -110,15 +110,15 @@ __global__ void transformKernel(float *g_odata, int width,
 }

 // read from texture, do expected transformation and write to global memory
-g_odata[face * width * width + y * width + x] =
--texCubemap<float>(tex, cx, cy, cz);
+g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
 }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 // use command-line specified CUDA device, otherwise use device with highest
 // Gflops/s
 int devID = findCudaDevice(argc, (const char **)argv);

@@ -129,13 +129,11 @@ int main(int argc, char **argv) {
 cudaDeviceProp deviceProps;

 checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
-deviceProps.multiProcessorCount);
+printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
 printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

 if (deviceProps.major < 2) {
-printf(
-"%s requires SM 2.0 or higher for support of Texture Arrays. Test "
+printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test "
 "will exit... \n",
 sSDKname);

@@ -157,8 +155,7 @@ int main(int argc, char **argv) {

 for (unsigned int layer = 0; layer < num_layers; layer++) {
 for (int i = 0; i < (int)(cubemap_size); i++) {
-h_data_ref[layer * cubemap_size + i] =
--h_data[layer * cubemap_size + i] + layer;
+h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
 }
 }

@@ -167,19 +164,16 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaMalloc((void **)&d_data, size));

 // allocate array and copy image data
-cudaChannelFormatDesc channelDesc =
-cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 cudaArray *cu_3darray;
 // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
 // make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
-checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
-make_cudaExtent(width, width, num_faces),
-cudaArrayCubemap));
+checkCudaErrors(
+cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
 cudaMemcpy3DParms myparms = {0};
 myparms.srcPos = make_cudaPos(0, 0, 0);
 myparms.dstPos = make_cudaPos(0, 0, 0);
-myparms.srcPtr =
-make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
+myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
 myparms.dstArray = cu_3darray;
 myparms.extent = make_cudaExtent(width, width, num_faces);
 myparms.kind = cudaMemcpyHostToDevice;

@@ -207,10 +201,12 @@ int main(int argc, char **argv) {
 dim3 dimBlock(8, 8, 1);
 dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

-printf(
-"Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
+printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
 "block has 8 x 8 threads\n",
-width, num_layers, dimGrid.x, dimGrid.y);
+width,
+num_layers,
+dimGrid.x,
+dimGrid.y);

 transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
 tex); // warmup (for better timing)

@@ -233,8 +229,7 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaDeviceSynchronize());
 sdkStopTimer(&timer);
 printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-printf("%.2f Mtexlookups/sec\n",
-(cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
+printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
 sdkDeleteTimer(&timer);

 // allocate mem for the result on host side

@@ -245,14 +240,13 @@ int main(int argc, char **argv) {
 // write regression file if necessary
 if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
 // write file for regression test
-sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f,
-false);
-} else {
+sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
+}
+else {
 printf("Comparing kernel output to expected data\n");

 #define MIN_EPSILON_ERROR 5e-3f
-bResult =
-compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
+bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
 }

 // cleanup memory
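
`transformKernel()` above takes its texture as a plain `cudaTextureObject_t` parameter; the object is built on the host over the `cudaArray`. A minimal sketch of that construction (descriptor settings are illustrative, not necessarily the sample's):

```cpp
#include <cuda_runtime.h>

// Build a texture object over an existing cudaArray so a kernel can take it
// as an ordinary parameter.
cudaTextureObject_t makeTexture(cudaArray_t array)
{
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = array;

    cudaTextureDesc texDesc = {};
    texDesc.normalizedCoords = 1;              // sample with coordinates in [0,1]
    texDesc.filterMode = cudaFilterModeLinear; // bilinear filtering
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex; // destroy later with cudaDestroyTextureObject(tex)
}
```
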

@@ -33,12 +33,12 @@
 */

 // Includes
+#include <cstring>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <iostream>
 #include <stdio.h>
 #include <string.h>
-#include <cstring>
-#include <iostream>

 // includes, project
 #include <helper_cuda.h>

@@ -66,11 +66,10 @@ int CleanupNoFailure(CUcontext &cuContext);
 void RandomInit(float *, int);
 bool findModulePath(const char *, string &, char **, ostringstream &);

-static void check(CUresult result, char const *const func,
-const char *const file, int const line) {
+static void check(CUresult result, char const *const func, const char *const file, int const line)
+{
 if (result) {
-fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
-static_cast<unsigned int>(result), func);
+fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
 exit(EXIT_FAILURE);
 }
 }

@@ -78,7 +77,8 @@ static void check(CUresult result, char const *const func,
 #define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

 // Host code
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 printf("simpleDrvRuntime..\n");
 int N = 50000, devID = 0;
 size_t size = N * sizeof(float);

@@ -100,7 +100,8 @@ int main(int argc, char **argv) {

 if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
 exit(EXIT_FAILURE);
-} else {
+}
+else {
 printf("> initCUDA loading module: <%s>\n", module_path.c_str());
 }

@@ -113,8 +114,7 @@ int main(int argc, char **argv) {
 checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

 // Get function handle from module
-checkCudaDrvErrors(
-cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
+checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

 // Allocate input vectors h_A and h_B in host memory
 checkCudaErrors(cudaMallocHost(&h_A, size));

@@ -133,10 +133,8 @@ int main(int argc, char **argv) {
 cudaStream_t stream;
 checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 // Copy vectors from host memory to device memory
-checkCudaErrors(
-cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
-checkCudaErrors(
-cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));
+checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
+checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

 int threadsPerBlock = 256;
 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

@@ -144,14 +142,12 @@ int main(int argc, char **argv) {
 void *args[] = {&d_A, &d_B, &d_C, &N};

 // Launch the CUDA kernel
-checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
-threadsPerBlock, 1, 1, 0, stream, args,
-NULL));
+checkCudaDrvErrors(
+cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));

 // Copy result from device memory to host memory
 // h_C contains the result in host memory
-checkCudaErrors(
-cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
+checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
 checkCudaErrors(cudaStreamSynchronize(stream));
 // Verify result
 int i;

@@ -171,7 +167,8 @@ int main(int argc, char **argv) {
 exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-int CleanupNoFailure(CUcontext &cuContext) {
+int CleanupNoFailure(CUcontext &cuContext)
+{
 // Free device memory
 checkCudaErrors(cudaFree(d_A));
 checkCudaErrors(cudaFree(d_B));

@@ -195,19 +192,21 @@ int CleanupNoFailure(CUcontext &cuContext) {
 return EXIT_SUCCESS;
 }
 // Allocates an array with random float entries.
-void RandomInit(float *data, int n) {
+void RandomInit(float *data, int n)
+{
 for (int i = 0; i < n; ++i) {
 data[i] = rand() / (float)RAND_MAX;
 }
 }

-bool inline findModulePath(const char *module_file, string &module_path,
-char **argv, ostringstream &ostrm) {
+bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
+{
 char *actual_path = sdkFindFilePath(module_file, argv[0]);

 if (actual_path) {
 module_path = actual_path;
-} else {
+}
+else {
 printf("> findModulePath file not found: <%s> \n", module_file);
 return false;
 }

@@ -215,7 +214,8 @@ bool inline findModulePath(const char *module_file, string &module_path,
 if (module_path.empty()) {
 printf("> findModulePath could not find file: <%s> \n", module_file);
 return false;
-} else {
+}
+else {
 printf("> findModulePath found file at <%s>\n", module_path.c_str());
 if (module_path.rfind("fatbin") != string::npos) {
 ifstream fileIn(module_path.c_str(), ios::binary);
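
The driver-API launch above passes kernel arguments as an array of pointers to the argument values, in declaration order. A compact sketch of the same call (wrapper name hypothetical):

```cpp
#include <cuda.h>

// Driver-API launch: args[] holds pointers to each kernel argument value.
void launchVecAdd(CUfunction f, CUdeviceptr dA, CUdeviceptr dB, CUdeviceptr dC, int n, CUstream stream)
{
    void *args[] = {&dA, &dB, &dC, &n};
    unsigned int threads = 256;
    unsigned int blocks = (n + threads - 1) / threads;
    cuLaunchKernel(f,
                   blocks, 1, 1,  // grid dimensions
                   threads, 1, 1, // block dimensions
                   0,             // dynamic shared memory (bytes)
                   stream,
                   args,          // kernelParams
                   NULL);         // extra
}
```
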

@@ -34,9 +34,10 @@
 */

 // Device code
-extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
-float *C, int N) {
+extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
+{
 int i = blockDim.x * blockIdx.x + threadIdx.x;

-if (i < N) C[i] = A[i] + B[i];
+if (i < N)
+C[i] = A[i] + B[i];
 }

@@ -44,7 +44,8 @@ const char *sSDKsample = "hyperQ";

 // This subroutine does no real work but runs for at least the specified number
 // of clock ticks.
-__device__ void clock_block(clock_t *d_o, clock_t clock_count) {
+__device__ void clock_block(clock_t *d_o, clock_t clock_count)
+{
 unsigned int start_clock = (unsigned int)clock();

 clock_t clock_offset = 0;

@@ -71,15 +72,12 @@ __device__ void clock_block(clock_t *d_o, clock_t clock_count) {
 // We create two identical kernels calling clock_block(), we create two so that
 // we can identify dependencies in the profile timeline ("kernel_B" is always
 // dependent on "kernel_A" in the same stream).
-__global__ void kernel_A(clock_t *d_o, clock_t clock_count) {
-clock_block(d_o, clock_count);
-}
-__global__ void kernel_B(clock_t *d_o, clock_t clock_count) {
-clock_block(d_o, clock_count);
-}
+__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
+__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }

 // Single-warp reduction kernel (note: this is not optimized for simplicity)
-__global__ void sum(clock_t *d_clocks, int N) {
+__global__ void sum(clock_t *d_clocks, int N)
+{
 // Handle to thread block group
 cg::thread_block cta = cg::this_thread_block();
 __shared__ clock_t s_clocks[32];

@@ -106,7 +104,8 @@ __global__ void sum(clock_t *d_clocks, int N) {
 }
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 int nstreams = 32; // One stream for each pair of kernels
 float kernel_time = 10; // Time each kernel should run in ms
 float elapsed_time;

@@ -131,18 +130,20 @@ int main(int argc, char **argv) {
 // HyperQ is available in devices of Compute Capability 3.5 and higher
 if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
 if (deviceProp.concurrentKernels == 0) {
-printf(
-"> GPU does not support concurrent kernel execution (SM 3.5 or "
+printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
 "higher required)\n");
 printf(" CUDA kernel runs will be serialized\n");
-} else {
+}
+else {
 printf("> GPU does not support HyperQ\n");
 printf(" CUDA kernel runs will have limited concurrency\n");
 }
 }

 printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
-deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+deviceProp.major,
+deviceProp.minor,
+deviceProp.multiProcessorCount);

 // Allocate host memory for the output (reduced to a single value)
 clock_t *a = 0;

@@ -153,8 +154,7 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));

 // Allocate and initialize an array of stream handles
-cudaStream_t *streams =
-(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
+cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

 for (int i = 0; i < nstreams; i++) {
 checkCudaErrors(cudaStreamCreate(&(streams[i])));

@@ -203,15 +203,15 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaEventSynchronize(stop_event));
 checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

-printf(
-"Expected time for serial execution of %d sets of kernels is between "
+printf("Expected time for serial execution of %d sets of kernels is between "
 "approx. %.3fs and %.3fs\n",
-nstreams, (nstreams + 1) * kernel_time / 1000.0f,
+nstreams,
+(nstreams + 1) * kernel_time / 1000.0f,
 2 * nstreams * kernel_time / 1000.0f);
-printf(
-"Expected time for fully concurrent execution of %d sets of kernels is "
+printf("Expected time for fully concurrent execution of %d sets of kernels is "
 "approx. %.3fs\n",
-nstreams, 2 * kernel_time / 1000.0f);
+nstreams,
+2 * kernel_time / 1000.0f);
 printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

 bool bTestResult = (a[0] >= total_clocks);
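
The measured concurrency comes from issuing each `kernel_A`/`kernel_B` pair into its own stream, depth-first; with HyperQ the pairs overlap across streams. A sketch of that launch loop, assuming `streams`, `d_a`, and a per-kernel tick budget `time_clocks` as set up in the sample:

```cpp
// Each stream gets a dependent pair; across streams the pairs can overlap.
for (int i = 0; i < nstreams; ++i) {
    kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);     // pair start
    kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks); // waits on kernel_A in-stream
}
```
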

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.
-
 ## References (for more details)
 
@ -32,6 +32,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "helper_cuda.h"
|
#include "helper_cuda.h"
|
||||||
#include "helper_multiprocess.h"
|
#include "helper_multiprocess.h"
|
||||||
static const char shmName[] = "simpleIPCshm";
|
static const char shmName[] = "simpleIPCshm";
|
||||||
@ -49,7 +50,8 @@ static const char shmName[] = "simpleIPCshm";
|
|||||||
#error Unsupported system
|
#error Unsupported system
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct shmStruct_st {
|
typedef struct shmStruct_st
|
||||||
|
{
|
||||||
size_t nprocesses;
|
size_t nprocesses;
|
||||||
int barrier;
|
int barrier;
|
||||||
int sense;
|
int sense;
|
||||||
@ -58,15 +60,16 @@ typedef struct shmStruct_st {
|
|||||||
cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
|
cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
|
||||||
} shmStruct;
|
} shmStruct;
|
||||||
|
|
||||||
__global__ void simpleKernel(char *ptr, int sz, char val) {
|
__global__ void simpleKernel(char *ptr, int sz, char val)
|
||||||
|
{
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
|
for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
|
||||||
ptr[idx] = val;
|
ptr[idx] = val;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void barrierWait(volatile int *barrier, volatile int *sense,
|
static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
|
||||||
unsigned int n) {
|
{
|
||||||
int count;
|
int count;
|
||||||
|
|
||||||
// Check-in
|
// Check-in
|
||||||
@ -84,7 +87,8 @@ static void barrierWait(volatile int *barrier, volatile int *sense,
|
|||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void childProcess(int id) {
|
static void childProcess(int id)
|
||||||
|
{
|
||||||
volatile shmStruct *shm = NULL;
|
volatile shmStruct *shm = NULL;
|
||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
sharedMemoryInfo info;
|
sharedMemoryInfo info;
|
||||||
@ -108,8 +112,7 @@ static void childProcess(int id) {
|
|||||||
checkCudaErrors(cudaSetDevice(shm->devices[id]));
|
checkCudaErrors(cudaSetDevice(shm->devices[id]));
|
||||||
checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
|
checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
|
||||||
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||||
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
|
||||||
&blocks, simpleKernel, threads, 0));
|
|
||||||
blocks *= prop.multiProcessorCount;
|
blocks *= prop.multiProcessorCount;
|
||||||
|
|
||||||
// Open and track all the allocations and events created in the master
|
// Open and track all the allocations and events created in the master
|
||||||
@ -121,10 +124,8 @@ static void childProcess(int id) {
|
|||||||
// Notice, we don't need to explicitly enable peer access for
|
// Notice, we don't need to explicitly enable peer access for
|
||||||
// allocations on other devices.
|
// allocations on other devices.
|
||||||
checkCudaErrors(
|
checkCudaErrors(
|
||||||
cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i],
|
cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
|
||||||
cudaIpcMemLazyEnablePeerAccess));
|
checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));
|
||||||
checkCudaErrors(cudaIpcOpenEventHandle(
|
|
||||||
&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));
|
|
||||||
|
|
||||||
ptrs.push_back(ptr);
|
ptrs.push_back(ptr);
|
||||||
events.push_back(event);
|
events.push_back(event);
|
||||||
@ -141,8 +142,7 @@ static void childProcess(int id) {
|
|||||||
// Wait for the buffer to be accessed to be ready
|
// Wait for the buffer to be accessed to be ready
|
||||||
checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
|
checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
|
||||||
// Push a simple kernel on it
|
// Push a simple kernel on it
|
||||||
simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId],
|
simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
|
||||||
DATA_SIZE, id);
|
|
||||||
checkCudaErrors(cudaGetLastError());
|
checkCudaErrors(cudaGetLastError());
|
||||||
// Signal that this buffer is ready for the next consumer
|
// Signal that this buffer is ready for the next consumer
|
||||||
checkCudaErrors(cudaEventRecord(events[bufferId], stream));
|
checkCudaErrors(cudaEventRecord(events[bufferId], stream));
|
||||||
@ -158,8 +158,7 @@ static void childProcess(int id) {
|
|||||||
|
|
||||||
// Now wait for my buffer to be ready so I can copy it locally and verify it
|
// Now wait for my buffer to be ready so I can copy it locally and verify it
|
||||||
checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
|
checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
|
||||||
checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE,
|
checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
|
||||||
cudaMemcpyDeviceToHost, stream));
|
|
||||||
// And wait for all the queued up work to complete
|
// And wait for all the queued up work to complete
|
||||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
@ -169,8 +168,11 @@ static void childProcess(int id) {
|
|||||||
char compareId = (char)((id + 1) % procCount);
|
char compareId = (char)((id + 1) % procCount);
|
||||||
for (unsigned long long j = 0; j < DATA_SIZE; j++) {
|
for (unsigned long long j = 0; j < DATA_SIZE; j++) {
|
||||||
         if (verification_buffer[j] != compareId) {
-            printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j,
-                   (int)verification_buffer[j], (int)compareId);
+            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
+                   id,
+                   j,
+                   (int)verification_buffer[j],
+                   (int)compareId);
         }
     }

@@ -185,7 +187,8 @@ static void childProcess(int id) {
     printf("Process %d complete!\n", id);
 }

-static void parentProcess(char *app) {
+static void parentProcess(char *app)
+{
     sharedMemoryInfo info;
     int devCount, i;
     volatile shmStruct *shm = NULL;

@@ -219,17 +222,14 @@ static void parentProcess(char *app) {
         // This sample requires two processes accessing each device, so we need
         // to ensure exclusive or prohibited mode is not set
         if (prop.computeMode != cudaComputeModeDefault) {
-            printf("Device %d is in an unsupported compute mode for this sample\n",
-                   i);
+            printf("Device %d is in an unsupported compute mode for this sample\n", i);
             continue;
         }

         for (int j = 0; j < shm->nprocesses; j++) {
             int canAccessPeerIJ, canAccessPeerJI;
-            checkCudaErrors(
-                cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
-            checkCudaErrors(
-                cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
+            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
+            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
             if (!canAccessPeerIJ || !canAccessPeerJI) {
                 allPeers = false;
                 break;

@@ -246,10 +246,11 @@ static void parentProcess(char *app) {
                 checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
             }
             shm->devices[shm->nprocesses++] = i;
-            if (shm->nprocesses >= MAX_DEVICES) break;
-        } else {
-            printf(
-                "Device %d is not peer capable with some other selected peers, "
-                "skipping\n",
-                i);
+            if (shm->nprocesses >= MAX_DEVICES)
+                break;
+        }
+        else {
+            printf("Device %d is not peer capable with some other selected peers, "
+                   "skipping\n",
+                   i);
         }

@@ -268,12 +269,9 @@ static void parentProcess(char *app) {

         checkCudaErrors(cudaSetDevice(shm->devices[i]));
         checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
-        checkCudaErrors(
-            cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
-        checkCudaErrors(cudaEventCreate(
-            &event, cudaEventDisableTiming | cudaEventInterprocess));
-        checkCudaErrors(cudaIpcGetEventHandle(
-            (cudaIpcEventHandle_t *)&shm->eventHandle[i], event));
+        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
+        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
+        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

         ptrs.push_back(ptr);
         events.push_back(event);

@@ -314,14 +312,16 @@ static void parentProcess(char *app) {
     sharedMemoryClose(&info);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 #if defined(__arm__) || defined(__aarch64__)
     printf("Not supported on ARM\n");
     return EXIT_WAIVED;
 #else
     if (argc == 1) {
         parentProcess(argv[0]);
-    } else {
+    }
+    else {
         childProcess(atoi(argv[1]));
     }
     return EXIT_SUCCESS;
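
Aside: stripped of the shared-memory plumbing, the IPC handshake these hunks reformat reduces to a few calls. A minimal sketch, assuming the parent exports an allocation and the 64-byte handle reaches the child out of band (the variable names here are illustrative, not the sample's):

// Parent: allocate device memory and serialize a handle for another process.
float *d_buf = NULL;
cudaIpcMemHandle_t handle;
checkCudaErrors(cudaMalloc(&d_buf, DATA_SIZE));
checkCudaErrors(cudaIpcGetMemHandle(&handle, d_buf));
// ... hand `handle` to the child (shared memory, pipe, etc.) ...

// Child: map the parent's allocation into this process.
float *d_mapped = NULL;
checkCudaErrors(cudaIpcOpenMemHandle((void **)&d_mapped, handle, cudaIpcMemLazyEnablePeerAccess));
// ... use d_mapped like any other device pointer, then unmap ...
checkCudaErrors(cudaIpcCloseMemHandle(d_mapped));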
@@ -36,10 +36,10 @@
  */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes, kernels
 #include <cuda_runtime.h>

@@ -54,8 +54,8 @@ static const char *sSDKname = "simpleLayeredTexture";
 //! Transform a layer of a layered 2D texture using texture lookups
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width, int height,
-                                int layer, cudaTextureObject_t tex) {
+__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
+{
     // calculate this thread's data point
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -67,14 +67,14 @@ __global__ void transformKernel(float *g_odata, int width, int height,
     float v = (y + 0.5f) / (float)height;

     // read from texture, do expected transformation and write to global memory
-    g_odata[layer * width * height + y * width + x] =
-        -tex2DLayered<float>(tex, u, v, layer) + layer;
+    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[%s] - Starting...\n", sSDKname);

     // use command-line specified CUDA device, otherwise use device with highest

@@ -87,8 +87,7 @@ int main(int argc, char **argv) {
     cudaDeviceProp deviceProps;

     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
-           deviceProps.multiProcessorCount);
+    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
     printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

     // generate input data for layered texture

@@ -106,8 +105,7 @@ int main(int argc, char **argv) {

     for (unsigned int layer = 0; layer < num_layers; layer++)
         for (int i = 0; i < (int)(width * height); i++) {
-            h_data_ref[layer * width * height + i] =
-                -h_data[layer * width * height + i] + layer;
+            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
         }

     // allocate device memory for result

@@ -115,17 +113,14 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaMalloc((void **)&d_data, size));

     // allocate array and copy image data
-    cudaChannelFormatDesc channelDesc =
-        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
     cudaArray *cu_3darray;
-    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
-                                      make_cudaExtent(width, height, num_layers),
-                                      cudaArrayLayered));
+    checkCudaErrors(
+        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
     cudaMemcpy3DParms myparms = {0};
     myparms.srcPos = make_cudaPos(0, 0, 0);
     myparms.dstPos = make_cudaPos(0, 0, 0);
-    myparms.srcPtr =
-        make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
+    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
     myparms.dstArray = cu_3darray;
     myparms.extent = make_cudaExtent(width, height, num_layers);
     myparms.kind = cudaMemcpyHostToDevice;

@@ -152,10 +147,12 @@ int main(int argc, char **argv) {
     dim3 dimBlock(8, 8, 1);
     dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

-    printf(
-        "Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
-        "8 x 8 threads\n",
-        width, height, dimGrid.x, dimGrid.y);
+    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
+           "8 x 8 threads\n",
+           width,
+           height,
+           dimGrid.x,
+           dimGrid.y);

     transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
                                            tex); // warmup (for better timing)

@@ -171,8 +168,7 @@ int main(int argc, char **argv) {

     // execute the kernel
     for (unsigned int layer = 0; layer < num_layers; layer++)
-        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer,
-                                                  tex);
+        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);

     // check if kernel execution generated an error
     getLastCudaError("Kernel execution failed");

@@ -180,9 +176,7 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaDeviceSynchronize());
     sdkStopTimer(&timer);
     printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-    printf("%.2f Mtexlookups/sec\n",
-           (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) /
-            1e6));
+    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
     sdkDeleteTimer(&timer);

     // allocate mem for the result on host side

@@ -193,14 +187,13 @@ int main(int argc, char **argv) {
     // write regression file if necessary
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
         // write file for regression test
-        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f,
-                            false);
-    } else {
+        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
+    }
+    else {
         printf("Comparing kernel output to expected data\n");

 #define MIN_EPSILON_ERROR 5e-3f
-        bResult = compareData(h_odata, h_data_ref, width * height * num_layers,
-                              MIN_EPSILON_ERROR, 0.0f);
+        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
     }

     // cleanup memory
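
Aside: the `tex` object sampled by `transformKernel` is built from the layered array allocated above. A minimal sketch of that setup; the filter and address modes shown are assumptions, since they are not visible in these hunks:

cudaTextureObject_t tex = 0;

cudaResourceDesc texRes = {};
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cu_3darray; // the cudaArrayLayered allocation from above

cudaTextureDesc texDescr = {};
texDescr.normalizedCoords = true;              // kernel passes u, v in [0, 1]
texDescr.filterMode = cudaFilterModeLinear;    // assumed
texDescr.addressMode[0] = cudaAddressModeWrap; // assumed
texDescr.addressMode[1] = cudaAddressModeWrap; // assumed
texDescr.readMode = cudaReadModeElementType;

checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));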
@@ -42,8 +42,8 @@
 // System includes
 #include <iostream>

-using std::cout;
 using std::cerr;
+using std::cout;
 using std::endl;

 // User include

@@ -58,7 +58,8 @@ using std::endl;

 // Host code
 // No CUDA here, only MPI
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     // Dimensions of the dataset
     int blockSize = 256;
     int gridSize = 10000;

@@ -87,8 +88,8 @@ int main(int argc, char *argv[]) {
     float *dataNode = new float[dataSizePerNode];

     // Dispatch a portion of the input data to each node
-    MPI_CHECK(MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode,
-                          dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));
+    MPI_CHECK(
+        MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

     if (commRank == 0) {
         // No need for root data any more

@@ -102,8 +103,7 @@ int main(int argc, char *argv[]) {
     float sumNode = sum(dataNode, dataSizePerNode);
     float sumRoot;

-    MPI_CHECK(
-        MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
+    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

     if (commRank == 0) {
         float average = sumRoot / dataSizeTotal;

@@ -122,7 +122,8 @@ int main(int argc, char *argv[]) {
 }

 // Shut down MPI cleanly if something goes wrong
-void my_abort(int err) {
+void my_abort(int err)
+{
     cout << "Test FAILED\n";
     MPI_Abort(MPI_COMM_WORLD, err);
 }
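
Aside: `MPI_CHECK` is the sample's error-checking wrapper; its definition lives outside these hunks. A plausible minimal definition, offered only as a hedged sketch of what such a macro typically looks like (the sample's actual macro may differ):

// Hypothetical reconstruction, not the sample's verbatim code.
#define MPI_CHECK(call)                                                      \
    do {                                                                     \
        int mpi_status = (call);                                             \
        if (mpi_status != MPI_SUCCESS) {                                     \
            cerr << "MPI error " << mpi_status << " at " << __FILE__ << ":"  \
                 << __LINE__ << endl;                                        \
            my_abort(mpi_status); /* my_abort() is defined in this file */   \
        }                                                                    \
    } while (0)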
@@ -51,13 +51,15 @@ using std::endl;

 // Device code
 // Very simple GPU Kernel that computes square roots of input numbers
-__global__ void simpleMPIKernel(float *input, float *output) {
+__global__ void simpleMPIKernel(float *input, float *output)
+{
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     output[tid] = sqrt(input[tid]);
 }

 // Initialize an array with random data (between 0 and 1)
-void initData(float *data, int dataSize) {
+void initData(float *data, int dataSize)
+{
     for (int i = 0; i < dataSize; i++) {
         data[i] = (float)rand() / RAND_MAX;
     }

@@ -65,7 +67,8 @@ void initData(float *data, int dataSize) {

 // CUDA computation on each node
 // No MPI here, only CUDA
-void computeGPU(float *hostData, int blockSize, int gridSize) {
+void computeGPU(float *hostData, int blockSize, int gridSize)
+{
     int dataSize = blockSize * gridSize;

     // Allocate data on GPU memory

@@ -76,22 +79,21 @@ void computeGPU(float *hostData, int blockSize, int gridSize) {
     CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

     // Copy to GPU memory
-    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float),
-                          cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

     // Run kernel
     simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

     // Copy data back to CPU memory
-    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float),
-                          cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

     // Free GPU memory
     CUDA_CHECK(cudaFree(deviceInputData));
     CUDA_CHECK(cudaFree(deviceOutputData));
 }

-float sum(float *data, int size) {
+float sum(float *data, int size)
+{
     float accum = 0.f;

     for (int i = 0; i < size; i++) {
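
Aside: `simpleMPIKernel` relies on the caller sizing the grid so that `gridSize * blockSize == dataSize`, as `computeGPU` does. A bounds-checked variant (a sketch, not the sample's code) tolerates arbitrary sizes:

__global__ void simpleMPIKernelBounded(const float *input, float *output, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) // guard the partial final block
        output[tid] = sqrtf(input[tid]);
}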
@@ -36,7 +36,8 @@
 */

 // Forward declarations
-extern "C" {
+extern "C"
+{
 void initData(float *data, int dataSize);
 void computeGPU(float *hostData, int blockSize, int gridSize);
 float sum(float *data, int size);
@@ -55,7 +55,8 @@ const char *sSDKname = "simpleMultiCopy";
 // includes, kernels
 // Declare the CUDA kernels here and main() code that is needed to launch
 // Compute workload on the system
-__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
+__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;

     if (idx < N) {

@@ -102,7 +103,8 @@ bool test();
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int cuda_device = 0;
     float scale_factor;
     cudaDeviceProp deviceProp;

@@ -115,7 +117,8 @@ int main(int argc, char *argv[]) {
         if (cuda_device < 0) {
             printf("Invalid command line parameters\n");
             exit(EXIT_FAILURE);
-        } else {
+        }
+        else {
             printf("cuda_device = %d\n", cuda_device);
             cuda_device = gpuDeviceInit(cuda_device);

@@ -124,7 +127,8 @@ int main(int argc, char *argv[]) {
                 exit(EXIT_SUCCESS);
             }
         }
-    } else {
+    }
+    else {
         // Otherwise pick the device with the highest Gflops/s
         cuda_device = gpuGetMaxGflopsDeviceId();
         checkCudaErrors(cudaSetDevice(cuda_device));

@@ -133,22 +137,23 @@ int main(int argc, char *argv[]) {
     }

     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
+    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
+           deviceProp.name,
            deviceProp.multiProcessorCount,
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
-           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-               deviceProp.multiProcessorCount);
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

     // Anything that is less than 32 Cores will have scaled down workload
     scale_factor =
-        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-                      (float)deviceProp.multiProcessorCount)),
+        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
             1.0f);
     N = (int)((float)N / scale_factor);

     printf("> Device name: %s\n", deviceProp.name);
     printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
-           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+           deviceProp.major,
+           deviceProp.minor,
+           deviceProp.multiProcessorCount);
     printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
     printf("> array_size = %d\n\n", N);

@@ -165,13 +170,11 @@ int main(int argc, char *argv[]) {
     h_data_sink = (int *)malloc(memsize);

     for (int i = 0; i < STREAM_COUNT; ++i) {
-        checkCudaErrors(
-            cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
         checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
         checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

-        checkCudaErrors(
-            cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
         checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

         checkCudaErrors(cudaStreamCreate(&stream[i]));

@@ -190,8 +193,7 @@ int main(int argc, char *argv[]) {

     // Time copies and kernel
     cudaEventRecord(start, 0);
-    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
-                                    cudaMemcpyHostToDevice, 0));
+    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
     cudaEventRecord(stop, 0);
     cudaEventSynchronize(stop);

@@ -199,8 +201,7 @@ int main(int argc, char *argv[]) {
     cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

     cudaEventRecord(start, 0);
-    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
-                                    cudaMemcpyDeviceToHost, 0));
+    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
     cudaEventRecord(stop, 0);
     cudaEventSynchronize(stop);

@@ -217,35 +218,27 @@ int main(int argc, char *argv[]) {

     printf("\n");
     printf("Relevant properties of this CUDA device\n");
-    printf(
-        "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
-        "(device property \"deviceOverlap\")\n",
-        deviceProp.deviceOverlap ? "X" : " ");
+    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
+           "(device property \"deviceOverlap\")\n",
+           deviceProp.deviceOverlap ? "X" : " ");
     // printf("(%s) Can execute several GPU kernels simultaneously (compute
     // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
-    printf(
-        "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
-        " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
-        "4000/5000/6000/K5000)\n",
-        (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
+    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
+           " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
+           "4000/5000/6000/K5000)\n",
+           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

     printf("\n");
     printf("Measured timings (throughput):\n");
-    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
-           (memsize * 1e-6) / memcpy_h2d_time);
-    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
-           (memsize * 1e-6) / memcpy_d2h_time);
-    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
-           (inner_reps * memsize * 2e-6) / kernel_time);
+    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
+    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
+    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

     printf("\n");
-    printf(
-        "Theoretical limits for speedup gained from overlapped data "
-        "transfers:\n");
-    printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
-           memcpy_h2d_time + memcpy_d2h_time + kernel_time);
-    printf("Compute can overlap with one transfer: %f ms\n",
-           max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
+    printf("Theoretical limits for speedup gained from overlapped data "
+           "transfers:\n");
+    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
+    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
     printf("Compute can overlap with both data transfers: %f ms\n",
            max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

@@ -254,18 +247,13 @@ int main(int argc, char *argv[]) {
     float overlap_time = processWithStreams(STREAM_COUNT);

     printf("\nAverage measured timings over %d repetitions:\n", nreps);
-    printf(" Avg. time when execution fully serialized\t: %f ms\n",
-           serial_time / nreps);
-    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT,
-           overlap_time / nreps);
-    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
-           (serial_time - overlap_time) / nreps);
+    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
+    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
+    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

     printf("\nMeasured throughput:\n");
-    printf(" Fully serialized execution\t\t: %f GB/s\n",
-           (nreps * (memsize * 2e-6)) / serial_time);
-    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
-           (nreps * (memsize * 2e-6)) / overlap_time);
+    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
+    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

     // Verify the results, we will use the results for final output
     bool bResults = test();

@@ -293,7 +281,8 @@ int main(int argc, char *argv[]) {
     exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-float processWithStreams(int streams_used) {
+float processWithStreams(int streams_used)
+{
     int current_stream = 0;

     float time;

@@ -326,17 +315,17 @@ float processWithStreams(int streams_used) {
             d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

         // Upload next frame
-        checkCudaErrors(
-            cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
-                            cudaMemcpyHostToDevice, stream[next_stream]));
+        checkCudaErrors(cudaMemcpyAsync(
+            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

         // Download current frame
-        checkCudaErrors(cudaMemcpyAsync(
-            h_data_out[current_stream], d_data_out[current_stream], memsize,
-            cudaMemcpyDeviceToHost, stream[current_stream]));
+        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
+                                        d_data_out[current_stream],
+                                        memsize,
+                                        cudaMemcpyDeviceToHost,
+                                        stream[current_stream]));

-        checkCudaErrors(
-            cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
+        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

         current_stream = next_stream;
     }

@@ -350,7 +339,8 @@ float processWithStreams(int streams_used) {
     return time;
 }

-void init() {
+void init()
+{
     for (int i = 0; i < N; ++i) {
         h_data_source[i] = 0;
     }

@@ -360,7 +350,8 @@ void init() {
     }
 }

-bool test() {
+bool test()
+{
     bool passed = true;

     for (int j = 0; j < STREAM_COUNT; ++j) {
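
Aside: the double-buffered loop in `processWithStreams` boils down to the following pattern. A condensed sketch using the sample's buffers and streams (loop bound and launch configuration are illustrative):

int current_stream = 0;
for (int i = 0; i < nreps; ++i) {
    int next_stream = (current_stream + 1) % STREAM_COUNT;

    // The current frame's kernel runs in its own stream ...
    incKernel<<<grid, block, 0, stream[current_stream]>>>(
        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

    // ... while the next frame's input uploads concurrently in another stream,
    cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
                    cudaMemcpyHostToDevice, stream[next_stream]);

    // and the current result downloads behind its kernel in the same stream.
    cudaMemcpyAsync(h_data_out[current_stream], d_data_out[current_stream], memsize,
                    cudaMemcpyDeviceToHost, stream[current_stream]);

    current_stream = next_stream;
}
cudaDeviceSynchronize(); // drain all streams before touching host buffers

Pinned host memory (cudaHostAlloc, as in the hunks above) is what makes the async copies actually overlap with kernel execution.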
@@ -37,15 +37,15 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)

@@ -64,12 +64,14 @@ const int DATA_N = 1048576 * 32;
 // Refer to the 'reduction' CUDA Sample describing
 // reduction optimization strategies
 ////////////////////////////////////////////////////////////////////////////////
-__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
+__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
+{
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int threadN = gridDim.x * blockDim.x;
     float sum = 0;

-    for (int pos = tid; pos < N; pos += threadN) sum += d_Input[pos];
+    for (int pos = tid; pos < N; pos += threadN)
+        sum += d_Input[pos];

     d_Result[tid] = sum;
 }

@@ -77,7 +79,8 @@ __global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     // Solver config
     TGPUplan plan[MAX_GPU_COUNT];

@@ -129,14 +132,10 @@ int main(int argc, char **argv) {
         checkCudaErrors(cudaSetDevice(i));
         checkCudaErrors(cudaStreamCreate(&plan[i].stream));
         // Allocate memory
-        checkCudaErrors(
-            cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
-        checkCudaErrors(
-            cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
-        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device,
-                                       ACCUM_N * sizeof(float)));
-        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data,
-                                       plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));

         for (j = 0; j < plan[i].dataN; j++) {
             plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;

@@ -158,19 +157,16 @@ int main(int argc, char **argv) {
         checkCudaErrors(cudaSetDevice(i));

         // Copy input data from CPU
-        checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
-                                        plan[i].dataN * sizeof(float),
-                                        cudaMemcpyHostToDevice, plan[i].stream));
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));

         // Perform GPU computations
-        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(
-            plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
+        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
         getLastCudaError("reduceKernel() execution failed.\n");

         // Read back GPU results
-        checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
-                                        ACCUM_N * sizeof(float),
-                                        cudaMemcpyDeviceToHost, plan[i].stream));
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
     }

     // Process GPU results
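
Aside: the dispatch loop reformatted above is the classic multi-GPU fan-out: queue asynchronous work on every device first, then wait. A condensed sketch, assuming `plan[i]` carries each device's stream and buffers as in the hunks:

for (int i = 0; i < GPU_N; i++) {
    checkCudaErrors(cudaSetDevice(i)); // subsequent calls target GPU i
    checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
                                    plan[i].dataN * sizeof(float),
                                    cudaMemcpyHostToDevice, plan[i].stream));
    reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
    checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
                                    ACCUM_N * sizeof(float),
                                    cudaMemcpyDeviceToHost, plan[i].stream));
}

// All GPUs are now working in parallel; wait for each stream to drain.
for (int i = 0; i < GPU_N; i++) {
    checkCudaErrors(cudaSetDevice(i));
    checkCudaErrors(cudaStreamSynchronize(plan[i].stream));
}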
@@ -37,7 +37,8 @@
 #ifndef SIMPLEMULTIGPU_H
 #define SIMPLEMULTIGPU_H

-typedef struct {
+typedef struct
+{
     // Host-side input data
     int dataN;
     float *h_Data;

@@ -56,7 +57,6 @@ typedef struct {

 } TGPUplan;

-extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N,
-                                    int BLOCK_N, int THREAD_N, cudaStream_t &s);
+extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);

 #endif
@@ -25,8 +25,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

-#include <iostream>
 #include <helper_cuda.h> // helper functions for CUDA error check
+#include <iostream>

 const int manualBlockSize = 32;

@@ -38,7 +38,8 @@ const int manualBlockSize = 32;
 // execution configuration, including anything the launch configurator
 // API suggests.
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void square(int *array, int arrayCount) {
+__global__ void square(int *array, int arrayCount)
+{
     extern __shared__ int dynamicSmem[];
     int idx = threadIdx.x + blockIdx.x * blockDim.x;

@@ -58,8 +59,8 @@ __global__ void square(int *array, int arrayCount) {
 // This wrapper routine computes the occupancy of kernel, and reports
 // it in terms of active warps / maximum warps per SM.
 ////////////////////////////////////////////////////////////////////////////////
-static double reportPotentialOccupancy(void *kernel, int blockSize,
-                                       size_t dynamicSMem) {
+static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
+{
     int device;
     cudaDeviceProp prop;

@@ -72,8 +73,7 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
     checkCudaErrors(cudaGetDevice(&device));
     checkCudaErrors(cudaGetDeviceProperties(&prop, device));

-    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks, kernel, blockSize, dynamicSMem));
+    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));

     activeWarps = numBlocks * blockSize / prop.warpSize;
     maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;

@@ -99,7 +99,8 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
 // This function configures the launch based on the "automatic"
 // argument, records the runtime, and reports occupancy and runtime.
 ////////////////////////////////////////////////////////////////////////////////
-static int launchConfig(int *array, int arrayCount, bool automatic) {
+static int launchConfig(int *array, int arrayCount, bool automatic)
+{
     int blockSize;
     int minGridSize;
     int gridSize;

@@ -116,14 +117,13 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
     checkCudaErrors(cudaEventCreate(&end));

     if (automatic) {
-        checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
-            &minGridSize, &blockSize, (void *)square, dynamicSMemUsage,
-            arrayCount));
+        checkCudaErrors(
+            cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));

         std::cout << "Suggested block size: " << blockSize << std::endl
-                  << "Minimum grid size for maximum occupancy: " << minGridSize
-                  << std::endl;
-    } else {
+                  << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
+    }
+    else {
         // This block size is too small. Given limited number of
         // active blocks per multiprocessor, the number of active
         // threads will be limited, and thus unable to achieve maximum

@@ -146,11 +146,9 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {

     // Calculate occupancy
     //
-    potentialOccupancy =
-        reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);
+    potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);

-    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%"
-              << std::endl;
+    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;

     // Report elapsed time
     //

@@ -166,7 +164,8 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
 // The test generates an array and squares it with a CUDA kernel, then
 // verifies the result.
 ////////////////////////////////////////////////////////////////////////////////
-static int test(bool automaticLaunchConfig, const int count = 1000000) {
+static int test(bool automaticLaunchConfig, const int count = 1000000)
+{
     int *array;
     int *dArray;
     int size = count * sizeof(int);

@@ -193,8 +192,7 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
     //
     for (int i = 0; i < count; i += 1) {
         if (array[i] != i * i) {
-            std::cout << "element " << i << " expected " << i * i << " actual "
-                      << array[i] << std::endl;
+            std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
             return 1;
         }
     }

@@ -210,13 +208,13 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
 // automatically configured launch, and reports the occupancy and
 // performance.
 ////////////////////////////////////////////////////////////////////////////////
-int main() {
+int main()
+{
     int status;

     std::cout << "starting Simple Occupancy" << std::endl << std::endl;

-    std::cout << "[ Manual configuration with " << manualBlockSize
-              << " threads per block ]" << std::endl;
+    std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;

     status = test(false);
     if (status) {
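
Aside: the two occupancy calls reformatted above pair naturally: ask the configurator for a block size, then launch with it. A minimal self-contained sketch with a hypothetical kernel (`scale` is illustrative, not the sample's):

__global__ void scale(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

void launchAtSuggestedSize(float *dData, int n)
{
    int minGridSize = 0, blockSize = 0;
    // Ask the configurator for the block size that maximizes occupancy
    // (0 bytes of dynamic shared memory for this kernel).
    checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)scale, 0, n));

    int gridSize = (n + blockSize - 1) / blockSize; // round up to cover every element
    scale<<<gridSize, blockSize>>>(dData, n);
    checkCudaErrors(cudaGetLastError());
}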
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
@@ -31,8 +31,8 @@
  */

 // includes, system
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>

 // CUDA includes
 #include <cuda_runtime.h>

@@ -41,7 +41,8 @@
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper for shared that are common to CUDA Samples

-__global__ void SimpleKernel(float *src, float *dst) {
+__global__ void SimpleKernel(float *src, float *dst)
+{
     // Just a dummy kernel, doing enough for us to verify that everything
     // worked
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;

@@ -50,12 +51,12 @@ __global__ void SimpleKernel(float *src, float *dst) {

 inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[%s] - Starting...\n", argv[0]);

     if (!IsAppBuiltAs64()) {
-        printf(
-            "%s is only supported with on 64-bit OSs and the application must be "
-            "built as a 64-bit target. Test is being waived.\n",
-            argv[0]);
+        printf("%s is only supported with on 64-bit OSs and the application must be "
+               "built as a 64-bit target. Test is being waived.\n",
+               argv[0]);
         exit(EXIT_WAIVED);

@@ -68,8 +69,7 @@ int main(int argc, char **argv) {
     printf("CUDA-capable device count: %i\n", gpu_n);

     if (gpu_n < 2) {
-        printf(
-            "Two or more GPUs with Peer-to-Peer access capability are required for "
-            "%s.\n",
-            argv[0]);
+        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
+               "%s.\n",
+               argv[0]);
         printf("Waiving test.\n");

@@ -97,8 +97,12 @@ int main(int argc, char **argv) {
                 continue;
             }
             checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
-            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name,
-                   i, prop[j].name, j, can_access_peer ? "Yes" : "No");
+            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
+                   prop[i].name,
+                   i,
+                   prop[j].name,
+                   j,
+                   can_access_peer ? "Yes" : "No");
             if (can_access_peer && p2pCapableGPUs[0] == -1) {
                 p2pCapableGPUs[0] = i;
                 p2pCapableGPUs[1] = j;

@@ -107,12 +111,10 @@ int main(int argc, char **argv) {
     }

     if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
-        printf(
-            "Two or more GPUs with Peer-to-Peer access capability are required for "
-            "%s.\n",
-            argv[0]);
-        printf(
-            "Peer to Peer access is not available amongst GPUs in the system, "
-            "waiving test.\n");
+        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
+               "%s.\n",
+               argv[0]);
+        printf("Peer to Peer access is not available amongst GPUs in the system, "
+               "waiving test.\n");

         exit(EXIT_WAIVED);

@@ -123,8 +125,7 @@ int main(int argc, char **argv) {
     gpuid[1] = p2pCapableGPUs[1];

     // Enable peer access
-    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0],
-           gpuid[1]);
+    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
     checkCudaErrors(cudaSetDevice(gpuid[1]));

@@ -132,8 +133,8 @@ int main(int argc, char **argv) {

     // Allocate buffers
     const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
-    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
-           int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
+    printf(
+        "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     float *g0;
     checkCudaErrors(cudaMalloc(&g0, buf_size));

@@ -141,8 +142,7 @@ int main(int argc, char **argv) {
     float *g1;
     checkCudaErrors(cudaMalloc(&g1, buf_size));
     float *h0;
-    checkCudaErrors(
-        cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA
+    checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

     // Create CUDA event handles
     printf("Creating event handles...\n");

@@ -161,7 +161,8 @@ int main(int argc, char **argv) {
         // Ping-pong copy between GPUs
         if (i % 2 == 0) {
             checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
-        } else {
+        }
+        else {
             checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
         }
     }

@@ -170,9 +171,9 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaEventSynchronize(stop_event));
     checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
     printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
-           gpuid[0], gpuid[1],
-           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f /
-               1024.0f / 1024.0f);
+           gpuid[0],
+           gpuid[1],
+           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

     // Prepare host buffer and copy to GPU 0
     printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

@@ -190,10 +191,11 @@ int main(int argc, char **argv) {

     // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
     // output to the GPU 1 buffer
-    printf(
-        "Run kernel on GPU%d, taking source data from GPU%d and writing to "
-        "GPU%d...\n",
-        gpuid[1], gpuid[0], gpuid[1]);
+    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
+           "GPU%d...\n",
+           gpuid[1],
+           gpuid[0],
+           gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     SimpleKernel<<<blocks, threads>>>(g0, g1);

@@ -201,10 +203,11 @@ int main(int argc, char **argv) {

     // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
     // output to the GPU 0 buffer
-    printf(
-        "Run kernel on GPU%d, taking source data from GPU%d and writing to "
-        "GPU%d...\n",
-        gpuid[0], gpuid[1], gpuid[0]);
+    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
+           "GPU%d...\n",
+           gpuid[0],
+           gpuid[1],
+           gpuid[0]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     SimpleKernel<<<blocks, threads>>>(g1, g0);

@@ -220,8 +223,7 @@ int main(int argc, char **argv) {
         // Re-generate input data and apply 2x '* 2.0f' computation of both
         // kernel runs
         if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
-            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i],
-                   (float(i % 4096) * 2.0f * 2.0f));
+            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

             if (error_count++ > 10) {
                 break;

@@ -253,7 +255,8 @@ int main(int argc, char **argv) {
     if (error_count != 0) {
         printf("Test failed!\n");
         exit(EXIT_FAILURE);
-    } else {
+    }
+    else {
         printf("Test passed\n");
         exit(EXIT_SUCCESS);
     }
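
Aside: once two capable devices are found, the core of the P2P path above is just a capability check, a mutual opt-in, and a UVA copy. A minimal sketch reusing the sample's buffers (device IDs are illustrative):

int gpu0 = 0, gpu1 = 1, can01 = 0, can10 = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&can01, gpu0, gpu1));
checkCudaErrors(cudaDeviceCanAccessPeer(&can10, gpu1, gpu0));

if (can01 && can10) {
    // Each device must enable access from its own context.
    checkCudaErrors(cudaSetDevice(gpu0));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpu1, 0));
    checkCudaErrors(cudaSetDevice(gpu1));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpu0, 0));

    // With UVA, cudaMemcpyDefault lets the runtime route the copy GPU-to-GPU.
    checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
}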
@@ -70,29 +70,26 @@ bool bTestResult = true;
 //! Shifts matrix elements using pitch linear array
 //! @param odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height,
-                                 int shiftX, int shiftY,
-                                 cudaTextureObject_t texRefPL) {
+__global__ void
+shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
+{
     int xid = blockIdx.x * blockDim.x + threadIdx.x;
     int yid = blockIdx.y * blockDim.y + threadIdx.y;

-    odata[yid * pitch + xid] = tex2D<float>(
-        texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
+    odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Shifts matrix elements using regular array
 //! @param odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void shiftArray(float *odata, int pitch, int width, int height,
-                           int shiftX, int shiftY,
-                           cudaTextureObject_t texRefArray) {
+__global__ void
+shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
+{
     int xid = blockIdx.x * blockDim.x + threadIdx.x;
     int yid = blockIdx.y * blockDim.y + threadIdx.y;

-    odata[yid * pitch + xid] =
-        tex2D<float>(texRefArray, (xid + shiftX) / (float)width,
-                     (yid + shiftY) / (float)height);
+    odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 }
@@ -102,20 +99,21 @@ void runTest(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n\n", sSDKsample);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sSDKsample,
-           bTestResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
     exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     // Set array size
     const int nx = 2048;
     const int ny = 2048;
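Both kernels sample through cudaTextureObject_t handles with normalized coordinates (hence the divisions by width and height) and rely on wrap addressing so that the shift wraps around the matrix edges. For orientation, a sketch of how a texture object over the pitch-linear input would be configured; this is a reconstruction for illustration, not the sample's literal setup code, and the filter mode is an assumption:

    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));
    texRes.resType                  = cudaResourceTypePitch2D;
    texRes.res.pitch2D.devPtr       = d_idataPL;
    texRes.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
    texRes.res.pitch2D.width        = nx;
    texRes.res.pitch2D.height       = ny;
    texRes.res.pitch2D.pitchInBytes = d_pitchBytes;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
    texDescr.normalizedCoords = true;                // kernels divide by width/height
    texDescr.filterMode       = cudaFilterModePoint; // assumption; not visible in this diff
    texDescr.addressMode[0]   = cudaAddressModeWrap; // lets the shift wrap around
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    cudaTextureObject_t texRefPL;
    checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));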
@@ -154,8 +152,7 @@ void runTest(int argc, char **argv) {
     float *d_idataPL;
     size_t d_pitchBytes;

-    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes,
-                                    nx * sizeof(float), ny));
+    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));

     // Array input data
     cudaArray *d_idataArray;
@@ -165,20 +162,17 @@ void runTest(int argc, char **argv) {

     // Pitch linear output data
     float *d_odata;
-    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes,
-                                    nx * sizeof(float), ny));
+    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));

     // Copy host data to device
     // Pitch linear
     size_t h_pitchBytes = nx * sizeof(float);

-    checkCudaErrors(cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes,
-                                 nx * sizeof(float), ny, cudaMemcpyHostToDevice));
+    checkCudaErrors(
+        cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));

     // Array
-    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata,
-                                      nx * ny * sizeof(float),
-                                      cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));

     cudaTextureObject_t texRefPL;
     cudaTextureObject_t texRefArray;
@@ -210,8 +204,7 @@ void runTest(int argc, char **argv) {
     texDescr.addressMode[0] = cudaAddressModeWrap;
     texDescr.addressMode[1] = cudaAddressModeWrap;
     texDescr.readMode = cudaReadModeElementType;
-    checkCudaErrors(
-        cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

     // Reference calculation
     for (int j = 0; j < ny; ++j) {
@@ -224,15 +217,13 @@ void runTest(int argc, char **argv) {
     }

     // Run ShiftPitchLinear kernel
-    checkCudaErrors(
-        cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
+    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));

     checkCudaErrors(cudaEventRecord(start, 0));

     for (int i = 0; i < NUM_REPS; ++i) {
-        shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata,
-                                                (int)(d_pitchBytes / sizeof(float)),
-                                                nx, ny, x_shift, y_shift, texRefPL);
+        shiftPitchLinear<<<dimGrid, dimBlock>>>(
+            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
     }

     checkCudaErrors(cudaEventRecord(stop, 0));
@@ -241,8 +232,8 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

     // Check results
-    checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
-                                 nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
+    checkCudaErrors(
+        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));

     bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

@@ -254,14 +245,12 @@ void runTest(int argc, char **argv) {
     }

     // Run ShiftArray kernel
-    checkCudaErrors(
-        cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
+    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
     checkCudaErrors(cudaEventRecord(start, 0));

     for (int i = 0; i < NUM_REPS; ++i) {
-        shiftArray<<<dimGrid, dimBlock>>>(d_odata,
-                                          (int)(d_pitchBytes / sizeof(float)), nx,
-                                          ny, x_shift, y_shift, texRefArray);
+        shiftArray<<<dimGrid, dimBlock>>>(
+            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
     }

     checkCudaErrors(cudaEventRecord(stop, 0));
@@ -270,8 +259,8 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

     // Check results
-    checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
-                                 nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
+    checkCudaErrors(
+        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
     res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

     if (res == false) {
@@ -279,21 +268,18 @@ void runTest(int argc, char **argv) {
         bTestResult = false;
     }

-    float bandwidthPL =
-        2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
-    float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
-                           (timeArray / NUM_REPS);
+    float bandwidthPL = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
+    float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);

-    printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
-           bandwidthPL, bandwidthArray);
+    printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);

     float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
     float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));

-    printf(
-        "\nTexture fetch rate (Mpix/s) for pitch linear: "
+    printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
            "%.2e; for array: %.2e\n\n",
-        fetchRatePL, fetchRateArray);
+           fetchRatePL,
+           fetchRateArray);

     // Cleanup
     free(h_idata);
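A note on the bandwidth expression reformatted in the last hunk: each of the nx * ny pixels is read once through the texture and written once to d_odata (the factor 2), sizeof(float) converts pixels to bytes, timePL / NUM_REPS is milliseconds per repetition, and the 1000/1e9 pair converts milliseconds to seconds and bytes to gigabytes. The same arithmetic, unpacked with the sample's own names:

    float msPerRep   = timePL / NUM_REPS;              // cudaEventElapsedTime reports milliseconds
    float bytesMoved = 2.0f * nx * ny * sizeof(float); // one texture read + one global write per pixel
    float gbPerSec   = bytesMoved / 1.0e9f / (msPerRep / 1000.0f);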
@@ -26,28 +26,30 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
 #endif

-__global__ void testKernel(int val) {
-    printf("[%d, %d]:\t\tValue is:%d\n", blockIdx.y * gridDim.x + blockIdx.x,
-           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
-               threadIdx.x,
+__global__ void testKernel(int val)
+{
+    printf("[%d, %d]:\t\tValue is:%d\n",
+           blockIdx.y * gridDim.x + blockIdx.x,
+           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
            val);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int devID;
     cudaDeviceProp props;

@@ -57,8 +59,7 @@ int main(int argc, char **argv) {
     // Get GPU information
     checkCudaErrors(cudaGetDevice(&devID));
     checkCudaErrors(cudaGetDeviceProperties(&props, devID));
-    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
-           props.major, props.minor);
+    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);

     printf("printf() is called. Output:\n\n");

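One point worth keeping in mind when reading the simplePrintf hunks: device-side printf writes into a buffer on the GPU that is flushed to the host only at synchronization points, so the kernel's output appears only after a call such as cudaDeviceSynchronize(). A minimal sketch (the launch configuration here is illustrative):

    testKernel<<<dimGrid, dimBlock>>>(10);
    checkCudaErrors(cudaDeviceSynchronize()); // flushes the device printf buffer to stdout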
@@ -48,24 +48,25 @@

 const char *sSDKsample = "simpleStreams";

-const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync",
-                                  "cudaEventDisableTiming", NULL};
+const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};

-const char *sDeviceSyncMethod[] = {
-    "cudaDeviceScheduleAuto", "cudaDeviceScheduleSpin",
-    "cudaDeviceScheduleYield", "INVALID",
-    "cudaDeviceScheduleBlockingSync", NULL};
+const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
+                                   "cudaDeviceScheduleSpin",
+                                   "cudaDeviceScheduleYield",
+                                   "INVALID",
+                                   "cudaDeviceScheduleBlockingSync",
+                                   NULL};

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef WIN32
 #include <sys/mman.h> // for mmap() / munmap()
@@ -75,7 +76,8 @@ const char *sDeviceSyncMethod[] = {
 #define MEMORY_ALIGNMENT 4096
 #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))

-__global__ void init_array(int *g_data, int *factor, int num_iterations) {
+__global__ void init_array(int *g_data, int *factor, int num_iterations)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;

     for (int i = 0; i < num_iterations; i++) {
@@ -83,7 +85,8 @@ __global__ void init_array(int *g_data, int *factor, int num_iterations) {
     }
 }

-bool correct_data(int *a, const int n, const int c) {
+bool correct_data(int *a, const int n, const int c)
+{
     for (int i = 0; i < n; i++) {
         if (a[i] != c) {
             printf("%d: %d %d\n", i, a[i], c);
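The ALIGN_UP macro above rounds an address up to the next size-byte boundary (size must be a power of two) by adding size - 1 and then masking off the low bits. A worked example with a hypothetical address:

    // ALIGN_UP(0x1001, 4096): 0x1001 + 0xFFF = 0x2000, and 0x2000 & ~0xFFF = 0x2000.
    // An already-aligned address is unchanged: ALIGN_UP(0x2000, 4096) == 0x2000.
    int *pAligned = (int *)ALIGN_UP(pRaw, MEMORY_ALIGNMENT); // pRaw is a hypothetical raw pointer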
@@ -94,51 +97,45 @@ bool correct_data(int *a, const int n, const int c) {
     return true;
 }

-inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a,
-                               int **ppAligned_a, int nbytes) {
+inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
+{
 #if CUDART_VERSION >= 4000
 #if !defined(__arm__) && !defined(__aarch64__)
     if (bPinGenericMemory) {
         // allocate a generic page-aligned chunk of system memory
 #ifdef WIN32
-        printf(
-            "> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
+        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
                "system memory)\n",
                (float)nbytes / 1048576.0f);
-        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT),
-                                    MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
 #else
-        printf(
-            "> mmap() allocating %4.2f Mbytes (generic page-aligned system "
+        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
                "memory)\n",
                (float)nbytes / 1048576.0f);
-        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT),
-                            PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
 #endif

         *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

-        printf(
-            "> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
+        printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
                "system memory\n",
                (float)nbytes / 1048576.0f);
         // pin allocate memory
-        checkCudaErrors(
-            cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
-    } else
+        checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
+    }
+    else
 #endif
 #endif
     {
-        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n",
-               (float)nbytes / 1048576.0f);
+        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
         // allocate host memory (pinned is required for achieve asynchronicity)
         checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
         *ppAligned_a = *pp_a;
     }
 }

-inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
-                           int **ppAligned_a, int nbytes) {
+inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
+{
 #if CUDART_VERSION >= 4000
 #if !defined(__arm__) && !defined(__aarch64__)
     // CUDA 4.0 support pinning of generic host memory
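Because AllocateHostMemory registers the generic allocation with cudaHostRegisterMapped, the pinned region can also be mapped into the device address space. That step is not part of the hunk above, but a hedged sketch of how a device pointer would be obtained for such memory:

    // Requires cudaSetDeviceFlags(... | cudaDeviceMapHost) before context creation,
    // and a prior cudaHostRegister(..., cudaHostRegisterMapped) on the host pointer.
    int *d_aligned = NULL;
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_aligned, (void *)*ppAligned_a, 0));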
@@ -150,7 +147,8 @@ inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
 #else
         munmap(*pp_a, nbytes);
 #endif
-    } else
+    }
+    else
 #endif
 #endif
     {
@@ -158,26 +156,24 @@ inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
     }
 }

-static const char *sSyncMethod[] = {
-    "0 (Automatic Blocking)",
+static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
                                     "1 (Spin Blocking)",
                                     "2 (Yield Blocking)",
                                     "3 (Undefined Blocking Method)",
                                     "4 (Blocking Sync Event) = low CPU utilization",
                                     NULL};

-void printHelp() {
+void printHelp()
+{
     printf("Usage: %s [options below]\n", sSDKsample);
     printf("\t--sync_method=n for CPU/GPU synchronization\n");
     printf("\t n=%s\n", sSyncMethod[0]);
     printf("\t n=%s\n", sSyncMethod[1]);
     printf("\t n=%s\n", sSyncMethod[2]);
     printf("\t <Default> n=%s\n", sSyncMethod[4]);
-    printf(
-        "\t--use_generic_memory (default) use generic page-aligned for system "
+    printf("\t--use_generic_memory (default) use generic page-aligned for system "
            "memory\n");
-    printf(
-        "\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
+    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
            "system memory\n");
 }

@@ -187,7 +183,8 @@ void printHelp() {
 #define DEFAULT_PINNED_GENERIC_MEMORY true
 #endif

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int cuda_device = 0;
     int nstreams = 4; // number of streams for CUDA calls
     int nreps = 10;   // number of times each experiment is repeated
@@ -199,10 +196,8 @@ int main(int argc, char **argv) {

     // allocate generic memory and pin it laster instead of using cudaHostAlloc()

-    bool bPinGenericMemory =
-        DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
-    int device_sync_method =
-        cudaDeviceBlockingSync; // by default we use BlockingSync
+    bool bPinGenericMemory = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
+    int device_sync_method = cudaDeviceBlockingSync;        // by default we use BlockingSync

     int niterations; // number of iterations for the loop inside the kernel

@@ -213,20 +208,18 @@ int main(int argc, char **argv) {
         return EXIT_SUCCESS;
     }

-    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv,
-                                                    "sync_method")) >= 0) {
-        if (device_sync_method == 0 || device_sync_method == 1 ||
-            device_sync_method == 2 || device_sync_method == 4) {
-            printf("Device synchronization method set to = %s\n",
-                   sSyncMethod[device_sync_method]);
+    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
+        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
+            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
             printf("Setting reps to 100 to demonstrate steady state\n");
             nreps = 100;
-        } else {
-            printf("Invalid command line option sync_method=\"%d\"\n",
-                   device_sync_method);
+        }
+        else {
+            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
             return EXIT_FAILURE;
         }
-    } else {
+    }
+    else {
         printHelp();
         return EXIT_SUCCESS;
     }
@@ -252,16 +245,13 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaGetDeviceCount(&num_devices));

     if (0 == num_devices) {
-        printf(
-            "your system does not have a CUDA capable device, waiving test...\n");
+        printf("your system does not have a CUDA capable device, waiving test...\n");
         return EXIT_WAIVED;
     }

     // check if the command-line chosen device ID is within range, exit if not
     if (cuda_device >= num_devices) {
-        printf(
-            "cuda_device=%d is invalid, must choose device ID between 0 and %d\n",
-            cuda_device, num_devices - 1);
+        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
         return EXIT_FAILURE;
     }

@@ -276,12 +266,10 @@ int main(int argc, char **argv) {
     // Check if GPU can map host memory (Generic Method), if not then we override
     // bPinGenericMemory to be false
     if (bPinGenericMemory) {
-        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name,
-               deviceProp.canMapHostMemory ? "Yes" : "No");
+        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

         if (deviceProp.canMapHostMemory == 0) {
-            printf(
-                "Using cudaMallocHost, CUDA device does not support mapping of "
+            printf("Using cudaMallocHost, CUDA device does not support mapping of "
                    "generic host memory\n");
             bPinGenericMemory = false;
         }
@@ -289,27 +277,22 @@ int main(int argc, char **argv) {

     // Anything that is less than 32 Cores will have scaled down workload
     scale_factor =
-        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-                      (float)deviceProp.multiProcessorCount)),
+        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
             1.0f);
     n = (int)rint((float)n / scale_factor);

-    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major,
-           deviceProp.minor);
+    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
     printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
            deviceProp.multiProcessorCount,
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
-           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-               deviceProp.multiProcessorCount);
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

     printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
     printf("> array_size = %d\n\n", n);

     // enable use of blocking sync, to reduce CPU usage
-    printf("> Using CPU/GPU Device Synchronization method (%s)\n",
-           sDeviceSyncMethod[device_sync_method]);
-    checkCudaErrors(cudaSetDeviceFlags(
-        device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));
+    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
+    checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

     // allocate host memory
     int c = 5; // value to which the array will be initialized
@@ -332,8 +315,7 @@ int main(int argc, char **argv) {
     printf("\nStarting Test\n");

     // allocate and initialize an array of stream handles
-    cudaStream_t *streams =
-        (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
+    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

     for (int i = 0; i < nstreams; i++) {
         checkCudaErrors(cudaStreamCreate(&(streams[i])));
@@ -342,9 +324,7 @@ int main(int argc, char **argv) {
     // create CUDA event handles
     // use blocking sync
     cudaEvent_t start_event, stop_event;
-    int eventflags =
-        ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync
-                                                        : cudaEventDefault);
+    int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);

     checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
     checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
@@ -354,11 +334,9 @@ int main(int argc, char **argv) {
     // ensure that all previous
     // CUDA calls have
     // completed
-    checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes,
-                                    cudaMemcpyDeviceToHost, streams[0]));
+    checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
     checkCudaErrors(cudaEventRecord(stop_event, 0));
-    checkCudaErrors(cudaEventSynchronize(
-        stop_event)); // block until the event is actually recorded
+    checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
     checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
     printf("memcopy:\t%.2f\n", time_memcpy);

@@ -380,8 +358,7 @@ int main(int argc, char **argv) {

     for (int k = 0; k < nreps; k++) {
         init_array<<<blocks, threads>>>(d_a, d_c, niterations);
-        checkCudaErrors(
-            cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
     }

     checkCudaErrors(cudaEventRecord(stop_event, 0));
@@ -395,16 +372,14 @@ int main(int argc, char **argv) {
     blocks = dim3(n / (nstreams * threads.x), 1);
     memset(hAligned_a, 255,
            nbytes); // set host memory bits to all 1s, for testing correctness
-    checkCudaErrors(cudaMemset(
-        d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
+    checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
     checkCudaErrors(cudaEventRecord(start_event, 0));

     for (int k = 0; k < nreps; k++) {
         // asynchronously launch nstreams kernels, each operating on its own portion
         // of data
         for (int i = 0; i < nstreams; i++) {
-            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams,
-                                                           d_c, niterations);
+            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
         }

         // asynchronously launch nstreams memcopies. Note that memcopy in stream x
@@ -413,8 +388,10 @@ int main(int argc, char **argv) {
         // completed
         for (int i = 0; i < nstreams; i++) {
             checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
-                                            d_a + i * n / nstreams, nbytes / nstreams,
-                                            cudaMemcpyDeviceToHost, streams[i]));
+                                            d_a + i * n / nstreams,
+                                            nbytes / nstreams,
+                                            cudaMemcpyDeviceToHost,
+                                            streams[i]));
         }
     }

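The loops reformatted above are the heart of simpleStreams: calls issued to different streams may overlap, while calls within one stream execute in issue order, so chunk i's download can begin only after chunk i's kernel finishes. Condensed to a per-stream skeleton (the sample itself issues all kernels before all copies; names follow the sample):

    for (int i = 0; i < nstreams; i++) {
        init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
        checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
                                        d_a + i * n / nstreams,
                                        nbytes / nstreams,
                                        cudaMemcpyDeviceToHost,
                                        streams[i]));
    }
    checkCudaErrors(cudaDeviceSynchronize()); // wait for all streams to drain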
@@ -34,10 +34,10 @@
  */

 // Includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@@ -73,23 +73,22 @@ static const char *sampleName = "simpleSurfaceWrite";
 //! Write to a cuArray (texture data source) using surface writes
 //! @param gIData input data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void surfaceWriteKernel(float *gIData, int width, int height,
-                                   cudaSurfaceObject_t outputSurface) {
+__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
+{
     // calculate surface coordinates
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // read from global memory and write to cuarray (via surface reference)
-    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y,
-                cudaBoundaryModeTrap);
+    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Transform an image using texture lookups
 //! @param gOData output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *gOData, int width, int height,
-                                float theta, cudaTextureObject_t tex) {
+__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
+{
     // calculate normalized texture coordinates
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
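In surfaceWriteKernel, surf2Dwrite addresses the surface in bytes along x, which is why a float store uses x * 4. The outputSurface handle is created over a cudaArray allocated with cudaArraySurfaceLoadStore; a sketch matching the calls visible later in this diff:

    cudaResourceDesc surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType         = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray; // must have been allocated with cudaArraySurfaceLoadStore

    cudaSurfaceObject_t outputSurface;
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));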
@@ -111,29 +110,29 @@ __global__ void transformKernel(float *gOData, int width, int height,
 // Declaration, forward
 void runTest(int argc, char **argv);

-extern "C" void computeGold(float *reference, float *idata,
-                            const unsigned int len);
+extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     // Process command-line arguments
     if (argc > 1) {
         if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
-            getCmdLineArgumentString(argc, (const char **)argv, "input",
-                                     (char **)&imageFilename);
+            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

             if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
-                getCmdLineArgumentString(argc, (const char **)argv, "reference",
-                                         (char **)&refFilename);
-            } else {
+                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
+            }
+            else {
                 printf("-input flag should be used with -reference flag");
                 exit(EXIT_FAILURE);
             }
-        } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
+        }
+        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
             printf("-reference flag should be used with -input flag");
             exit(EXIT_FAILURE);
         }
@@ -141,15 +140,15 @@ int main(int argc, char **argv) {

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     // Use command-line specified CUDA device,
     // otherwise use device with highest Gflops/s
     int devID = findCudaDevice(argc, (const char **)argv);
@@ -159,7 +158,9 @@ void runTest(int argc, char **argv) {

     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
     printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
-           deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major,
+           deviceProps.name,
+           deviceProps.multiProcessorCount,
+           deviceProps.major,
            deviceProps.minor);

     // Load image from disk
@@ -193,11 +194,9 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaMalloc((void **)&dData, size));

     // Allocate array and copy image data
-    cudaChannelFormatDesc channelDesc =
-        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
     cudaArray *cuArray;
-    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
-                                    cudaArraySurfaceLoadStore));
+    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

     dim3 dimBlock(8, 8, 1);
     dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
@@ -211,11 +210,9 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
 #if 1
     checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
-    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
-                                              outputSurface);
+    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
 #else // This is what differs from the example simpleTexture
-    checkCudaErrors(
-        cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
 #endif

     cudaTextureObject_t tex;
@@ -254,8 +251,7 @@ void runTest(int argc, char **argv) {
     cudaDeviceSynchronize();
     sdkStopTimer(&timer);
     printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
-    printf("%.2f Mpixels/sec\n",
-           (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
+    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
     sdkDeleteTimer(&timer);

     // Allocate mem for the result on host side
@@ -272,9 +268,9 @@ void runTest(int argc, char **argv) {
     // Write regression file if necessary
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
         // Write file for regression test
-        sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f,
-                            false);
-    } else {
+        sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
+    }
+    else {
         // We need to reload the data from disk,
         // because it is inverted upon output
         sdkLoadPGM(outputFilename, &hOData, &width, &height);
@@ -282,8 +278,7 @@ void runTest(int argc, char **argv) {
         printf("Comparing files\n");
         printf("\toutput: <%s>\n", outputFilename);
         printf("\treference: <%s>\n", refPath);
-        testResult =
-            compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
+        testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
     }

     checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
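For the transformKernel half of the sample, the texture object is backed by the same cudaArray the surface writes into. A hedged sketch of an array-backed texture object (the sample's exact filter and address modes are not visible in these hunks, so those settings are assumptions):

    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cuArray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
    texDescr.normalizedCoords = true;                 // transformKernel uses normalized coordinates
    texDescr.filterMode       = cudaFilterModeLinear; // assumption
    texDescr.addressMode[0]   = cudaAddressModeWrap;  // assumption
    texDescr.addressMode[1]   = cudaAddressModeWrap;  // assumption
    texDescr.readMode         = cudaReadModeElementType;

    cudaTextureObject_t tex;
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));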
@@ -68,10 +68,11 @@
 // this
 // struct by putting an undefined symbol in the function body so it won't
 // compile.
-template <typename T>
-struct SharedMemory {
+template <typename T> struct SharedMemory
+{
     // Ensure that we won't compile any un-specialized types
-    __device__ T *getPointer() {
+    __device__ T *getPointer()
+    {
         extern __device__ void error(void);
         error();
         return NULL;
@@ -82,89 +83,100 @@ struct SharedMemory {
 // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
 // One could also specialize it for user-defined types.

-template <>
-struct SharedMemory<int> {
-    __device__ int *getPointer() {
+template <> struct SharedMemory<int>
+{
+    __device__ int *getPointer()
+    {
         extern __shared__ int s_int[];
         return s_int;
     }
 };

-template <>
-struct SharedMemory<unsigned int> {
-    __device__ unsigned int *getPointer() {
+template <> struct SharedMemory<unsigned int>
+{
+    __device__ unsigned int *getPointer()
+    {
         extern __shared__ unsigned int s_uint[];
         return s_uint;
     }
 };

-template <>
-struct SharedMemory<char> {
-    __device__ char *getPointer() {
+template <> struct SharedMemory<char>
+{
+    __device__ char *getPointer()
+    {
         extern __shared__ char s_char[];
         return s_char;
     }
 };

-template <>
-struct SharedMemory<unsigned char> {
-    __device__ unsigned char *getPointer() {
+template <> struct SharedMemory<unsigned char>
+{
+    __device__ unsigned char *getPointer()
+    {
         extern __shared__ unsigned char s_uchar[];
         return s_uchar;
     }
 };

-template <>
-struct SharedMemory<short> {
-    __device__ short *getPointer() {
+template <> struct SharedMemory<short>
+{
+    __device__ short *getPointer()
+    {
         extern __shared__ short s_short[];
         return s_short;
     }
 };

-template <>
-struct SharedMemory<unsigned short> {
-    __device__ unsigned short *getPointer() {
+template <> struct SharedMemory<unsigned short>
+{
+    __device__ unsigned short *getPointer()
+    {
         extern __shared__ unsigned short s_ushort[];
         return s_ushort;
     }
 };

-template <>
-struct SharedMemory<long> {
-    __device__ long *getPointer() {
+template <> struct SharedMemory<long>
+{
+    __device__ long *getPointer()
+    {
         extern __shared__ long s_long[];
         return s_long;
     }
 };

-template <>
-struct SharedMemory<unsigned long> {
-    __device__ unsigned long *getPointer() {
+template <> struct SharedMemory<unsigned long>
+{
+    __device__ unsigned long *getPointer()
+    {
         extern __shared__ unsigned long s_ulong[];
         return s_ulong;
     }
 };

-template <>
-struct SharedMemory<bool> {
-    __device__ bool *getPointer() {
+template <> struct SharedMemory<bool>
+{
+    __device__ bool *getPointer()
+    {
         extern __shared__ bool s_bool[];
         return s_bool;
     }
 };

-template <>
-struct SharedMemory<float> {
-    __device__ float *getPointer() {
+template <> struct SharedMemory<float>
+{
+    __device__ float *getPointer()
+    {
         extern __shared__ float s_float[];
         return s_float;
     }
 };

-template <>
-struct SharedMemory<double> {
-    __device__ double *getPointer() {
+template <> struct SharedMemory<double>
+{
+    __device__ double *getPointer()
+    {
         extern __shared__ double s_double[];
         return s_double;
     }
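The reason for this wall of specializations: a templated kernel cannot simply declare extern __shared__ T sdata[] for each T, because the distinct extern declarations would all name the same shared-memory symbol with conflicting types, which the compiler rejects. SharedMemory<T> works around that by giving each supported type its own distinctly named extern array. Typical usage in a templated kernel (a minimal sketch; the kernel name is illustrative):

    template <class T> __global__ void copyViaShared(T *g_idata, T *g_odata)
    {
        SharedMemory<T> smem;
        T *sdata = smem.getPointer(); // resolves to s_int, s_float, ... per specialization

        sdata[threadIdx.x] = g_idata[threadIdx.x];
        __syncthreads();
        g_odata[threadIdx.x] = sdata[threadIdx.x];
    }

    // Host side: the dynamic shared-memory size is the third launch parameter, e.g.
    // copyViaShared<float><<<grid, block, block.x * sizeof(float)>>>(d_in, d_out);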
@@ -32,17 +32,17 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <string.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
@@ -58,8 +58,8 @@ int g_TotalFailures = 0;
 //! @param g_idata input data in global memory
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-template <class T>
-__global__ void testKernel(T *g_idata, T *g_odata) {
+template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
+{
     // Shared mem size is determined by the host app at run time
     SharedMemory<T> smem;
     T *sdata = smem.getPointer();
@@ -83,11 +83,10 @@ __global__ void testKernel(T *g_idata, T *g_odata) {

 ////////////////////////////////////////////////////////////////////////////////
 // declaration, forward
-template <class T>
-void runTest(int argc, char **argv, int len);
+template <class T> void runTest(int argc, char **argv, int len);

-template <class T>
-void computeGold(T *reference, T *idata, const unsigned int len) {
+template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
+{
     const T T_len = static_cast<T>(len);

     for (unsigned int i = 0; i < len; ++i) {
@@ -98,7 +97,8 @@ void computeGold(T *reference, T *idata, const unsigned int len) {
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("> runTest<float,32>\n");
     runTest<float>(argc, argv, 32);
     printf("> runTest<int,64>\n");
@@ -114,60 +114,63 @@ int main(int argc, char **argv) {
 // functions for different types.

 // Here's the generic wrapper for cutCompare*
-template <class T>
-class ArrayComparator {
+template <class T> class ArrayComparator
+{
 public:
-    bool compare(const T *reference, T *data, unsigned int len) {
-        fprintf(stderr,
-                "Error: no comparison function implemented for this type\n");
+    bool compare(const T *reference, T *data, unsigned int len)
+    {
+        fprintf(stderr, "Error: no comparison function implemented for this type\n");
         return false;
     }
 };

 // Here's the specialization for ints:
-template <>
-class ArrayComparator<int> {
+template <> class ArrayComparator<int>
+{
 public:
-    bool compare(const int *reference, int *data, unsigned int len) {
+    bool compare(const int *reference, int *data, unsigned int len)
+    {
         return compareData(reference, data, len, 0.15f, 0.0f);
     }
 };

 // Here's the specialization for floats:
-template <>
-class ArrayComparator<float> {
+template <> class ArrayComparator<float>
+{
 public:
-    bool compare(const float *reference, float *data, unsigned int len) {
+    bool compare(const float *reference, float *data, unsigned int len)
+    {
         return compareData(reference, data, len, 0.15f, 0.15f);
     }
 };

 // Here's the generic wrapper for cutWriteFile*
-template <class T>
-class ArrayFileWriter {
+template <class T> class ArrayFileWriter
+{
 public:
-    bool write(const char *filename, T *data, unsigned int len, float epsilon) {
-        fprintf(stderr,
-                "Error: no file write function implemented for this type\n");
+    bool write(const char *filename, T *data, unsigned int len, float epsilon)
+    {
+        fprintf(stderr, "Error: no file write function implemented for this type\n");
         return false;
     }
 };

 // Here's the specialization for ints:
-template <>
-class ArrayFileWriter<int> {
+template <> class ArrayFileWriter<int>
+{
 public:
-    bool write(const char *filename, int *data, unsigned int len, float epsilon) {
+    bool write(const char *filename, int *data, unsigned int len, float epsilon)
+    {
         return sdkWriteFile(filename, data, len, epsilon, false);
     }
 };

 // Here's the specialization for floats:
-template <>
-class ArrayFileWriter<float> {
+template <> class ArrayFileWriter<float>
+{
 public:
-    bool write(const char *filename, float *data, unsigned int len,
-               float epsilon) {
+    bool write(const char *filename, float *data, unsigned int len, float epsilon)
+    {
         return sdkWriteFile(filename, data, len, epsilon, false);
     }
 };
@@ -175,8 +178,8 @@ class ArrayFileWriter<float> {
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-template <class T>
-void runTest(int argc, char **argv, int len) {
+template <class T> void runTest(int argc, char **argv, int len)
+{
     int devID;
     cudaDeviceProp deviceProps;

@@ -184,8 +187,7 @@ void runTest(int argc, char **argv, int len) {

     // get number of SMs on this GPU
     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name,
-           deviceProps.multiProcessorCount);
+    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

     // create and start timer
     StopWatchInterface *timer = NULL;
@@ -209,8 +211,7 @@ void runTest(int argc, char **argv, int len) {
     T *d_idata;
     checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
     // copy host memory to device
-    checkCudaErrors(
-        cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

     // allocate device memory for result
     T *d_odata;
@@ -229,8 +230,7 @@ void runTest(int argc, char **argv, int len) {
     // allocate mem for the result on host side
     T *h_odata = (T *)malloc(mem_size);
     // copy result from device to host
-    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));

     sdkStopTimer(&timer);
     printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
@@ -247,7 +247,8 @@ void runTest(int argc, char **argv, int len) {
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
|
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
|
||||||
// write file for regression test
|
// write file for regression test
|
||||||
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
|
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// custom output handling when no regression test running
|
// custom output handling when no regression test running
|
||||||
// in this case check if the result is equivalent to the expected solution
|
// in this case check if the result is equivalent to the expected solution
|
||||||
bool res = comparator.compare(reference, h_odata, num_threads);
|
bool res = comparator.compare(reference, h_odata, num_threads);
|
||||||
|
|||||||
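A minimal, self-contained sketch of the specialization pattern these hunks reformat: the primary template is a loud failure path, and each explicit specialization supplies the type-specific behavior. The `Comparator` name and the exact-match integer check below are illustrative stand-ins, not the sample's `compareData` helper.

#include <cstdio>

template <class T> struct Comparator
{
    // Generic fallback: reject types we have no comparison for.
    bool compare(const T *, const T *, unsigned int)
    {
        fprintf(stderr, "no comparison for this type\n");
        return false;
    }
};

template <> struct Comparator<int>
{
    // Exact match is fine for integers (floats would need a tolerance).
    bool compare(const int *ref, const int *data, unsigned int len)
    {
        for (unsigned int i = 0; i < len; ++i)
            if (ref[i] != data[i])
                return false;
        return true;
    }
};

int main()
{
    int a[3] = {1, 2, 3}, b[3] = {1, 2, 3};
    Comparator<int> c;
    return c.compare(a, b, 3) ? 0 : 1;
}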
@@ -34,10 +34,10 @@
 */

// Includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -73,8 +73,8 @@ bool testResult = true;
//! Transform an image using texture lookups
//! @param outputData output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -98,23 +98,24 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1) {
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
            }
            else {
                printf("-input flag should be used with -reference flag");
                exit(EXIT_FAILURE);
            }
        }
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }
@@ -122,15 +123,15 @@ int main(int argc, char **argv) {

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

    // load image from disk
@@ -164,12 +165,10 @@ void runTest(int argc, char **argv) {
    checkCudaErrors(cudaMalloc((void **)&dData, size));

    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));

    cudaTextureObject_t tex;
    cudaResourceDesc texRes;
@@ -209,8 +208,7 @@ void runTest(int argc, char **argv) {
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Allocate mem for the result on host side
@@ -228,9 +226,9 @@ void runTest(int argc, char **argv) {
    // Write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // Write file for regression test
        sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk,
        // because it is inverted upon output
        sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
@@ -239,8 +237,7 @@ void runTest(int argc, char **argv) {
        printf("\toutput: <%s>\n", outputFilename);
        printf("\treference: <%s>\n", refPath);

        testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
    }

    checkCudaErrors(cudaDestroyTextureObject(tex));
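For context on the texture-object API these hunks reformat, a hedged, minimal sketch: bind a cudaArray to a cudaTextureObject_t and sample it from a kernel. Sizes and values are made up, and cudaMemcpy2DToArray stands in for the deprecated cudaMemcpyToArray that the sample still calls.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void sample(float *out, cudaTextureObject_t t)
{
    // Normalized coordinates: (0.5, 0.5) is the center of the image.
    out[0] = tex2D<float>(t, 0.5f, 0.5f);
}

int main()
{
    const int w = 4, h = 4;
    float host[w * h] = {0};
    host[5] = 1.0f;

    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *arr;
    cudaMallocArray(&arr, &desc, w, h);
    cudaMemcpy2DToArray(arr, 0, 0, host, w * sizeof(float), w * sizeof(float), h, cudaMemcpyHostToDevice);

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypeArray;
    res.res.array.array = arr;

    cudaTextureDesc td = {};
    td.normalizedCoords = 1;
    td.filterMode = cudaFilterModeLinear;
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeWrap;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &res, &td, NULL);

    float *d_out, v;
    cudaMalloc((void **)&d_out, sizeof(float));
    sample<<<1, 1>>>(d_out, tex);
    cudaMemcpy(&v, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sampled %f\n", v);

    cudaDestroyTextureObject(tex);
    cudaFreeArray(arr);
    cudaFree(d_out);
    return 0;
}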
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)
@@ -32,11 +32,11 @@
   using 3D texture lookups.
*/

#include <helper_gl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
@@ -49,9 +49,9 @@
#endif

// includes, cuda
#include <cuda_gl_interop.h>
#include <cuda_runtime.h>
#include <vector_types.h>

// CUDA utilities and system includes
#include <helper_cuda.h>
@@ -76,8 +76,7 @@ const dim3 gridSize(width / blockSize.x, height / blockSize.y);
float w = 0.5; // texture coordinate in z

GLuint pbo;                                     // OpenGL pixel buffer object
struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)

bool linearFiltering = true;
bool animate = true;
@@ -105,13 +104,13 @@ char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void setTextureFilterMode(bool bLinearFilter);
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
extern void cleanupCuda();

void loadVolumeData(char *exec_path);

void computeFPS()
{
    frameCount++;
    fpsCount++;

@@ -129,13 +128,13 @@ void computeFPS() {
}

// render image using CUDA
void render()
{
    // map PBO to get CUDA device pointer
    g_GraphicsMapFlag++;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
    // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    // call CUDA kernel, writing results to PBO
@@ -150,7 +149,8 @@ void render() {
}

// display results using OpenGL (called by GLUT)
void display()
{
    sdkStartTimer(&timer);

    render();
@@ -172,14 +172,16 @@ void display() {
    computeFPS();
}

void idle()
{
    if (animate) {
        w += 0.01f;
        glutPostRedisplay();
    }
}

void keyboard(unsigned char key, int x, int y)
{
    switch (key) {
        case 27:
#if defined(__APPLE__) || defined(MACOSX)
@@ -216,7 +218,8 @@ void keyboard(unsigned char key, int x, int y) {
    glutPostRedisplay();
}

void reshape(int x, int y)
{
    glViewport(0, 0, x, y);

    glMatrixMode(GL_MODELVIEW);
@@ -227,7 +230,8 @@ void reshape(int x, int y) {
    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}

void cleanup()
{
    sdkDeleteTimer(&timer);

    // add extra check to unmap the resource before unregistering it
@@ -242,21 +246,21 @@ void cleanup() {
    cleanupCuda();
}

void initGLBuffers()
{
    // create pixel buffer object
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    // register this buffer object with CUDA
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
}

// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");

    if (!fp) {
@@ -273,7 +277,8 @@ uchar *loadRawFile(const char *filename, size_t size) {
    return data;
}

void initGL(int *argc, char **argv)
{
    // initialize GLUT callback functions
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
@@ -284,16 +289,15 @@ void initGL(int *argc, char **argv) {
    glutReshapeFunc(reshape);
    glutIdleFunc(idle);

    if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
        fprintf(stderr, "Required OpenGL extensions are missing.");
        exit(EXIT_FAILURE);
    }
}

void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));

    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);
@@ -302,15 +306,15 @@ void runAutoTest(const char *ref_file, char *exec_path) {
    getLastCudaError("render_kernel failed");

    void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");

    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
                                              sdkFindFilePath(ref_file, exec_path),
                                              width * height,
                                              MAX_EPSILON_ERROR,
                                              THRESHOLD,
                                              exec_path);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);
@@ -321,13 +325,13 @@ void runAutoTest(const char *ref_file, char *exec_path) {
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void loadVolumeData(char *exec_path)
{
    // load volume data
    const char *path = sdkFindFilePath(volumeFilename, exec_path);

    if (path == NULL) {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

@@ -343,7 +347,8 @@ void loadVolumeData(char *exec_path) {
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

@@ -367,7 +372,8 @@ int main(int argc, char **argv) {
    if (ref_file) {
        loadVolumeData(argv[0]);
        runAutoTest(ref_file, argv[0]);
    }
    else {
        initGL(&argc, argv);

        // OpenGL buffers
@@ -376,8 +382,7 @@ int main(int argc, char **argv) {
        loadVolumeData(argv[0]);
    }

    printf("Press space to toggle animation\n"
           "Press '+' and '-' to change displayed slice\n");

#if defined(__APPLE__) || defined(MACOSX)
@@ -28,13 +28,12 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_

#include <helper_cuda.h>
#include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned int uint;
typedef unsigned char uchar;
@@ -42,8 +41,8 @@ typedef unsigned char uchar;
cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture

__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

@@ -59,7 +58,8 @@ __global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
    }
}

extern "C" void setTextureFilterMode(bool bLinearFilter)
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
@@ -73,8 +73,7 @@ extern "C" void setTextureFilterMode(bool bLinearFilter) {
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
    texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap;
@@ -84,7 +83,8 @@ extern "C" void setTextureFilterMode(bool bLinearFilter) {
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
@@ -92,8 +92,7 @@ extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent = volumeSize;
    copyParams.kind = cudaMemcpyHostToDevice;
@@ -121,12 +120,13 @@ extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda()
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
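A hedged, minimal sketch of the 3D-texture path reformatted above: upload a tiny volume with cudaMemcpy3D and fetch it with tex3D<>(). The 2x2x2 volume and all values are illustrative.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void fetch(float *out, cudaTextureObject_t vol)
{
    // Sample the volume center with normalized coordinates.
    out[0] = tex3D<float>(vol, 0.5f, 0.5f, 0.5f);
}

int main()
{
    const int N = 2; // 2x2x2 volume
    float host[N * N * N];
    for (int i = 0; i < N * N * N; ++i)
        host[i] = (float)i;

    cudaExtent extent = make_cudaExtent(N, N, N);
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray *vol;
    cudaMalloc3DArray(&vol, &desc, extent);

    // Describe the host slab and copy it into the 3D array.
    cudaMemcpy3DParms p = {0};
    p.srcPtr = make_cudaPitchedPtr(host, N * sizeof(float), N, N);
    p.dstArray = vol;
    p.extent = extent;
    p.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&p);

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypeArray;
    res.res.array.array = vol;

    cudaTextureDesc td = {};
    td.normalizedCoords = 1;
    td.filterMode = cudaFilterModeLinear;
    td.addressMode[0] = td.addressMode[1] = td.addressMode[2] = cudaAddressModeWrap;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &res, &td, NULL);

    float *d_out, v;
    cudaMalloc((void **)&d_out, sizeof(float));
    fetch<<<1, 1>>>(d_out, tex);
    cudaMemcpy(&v, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("center sample: %f\n", v); // linear filter averages the 8 voxels

    cudaDestroyTextureObject(tex);
    cudaFreeArray(vol);
    cudaFree(d_out);
    return 0;
}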
@@ -39,16 +39,16 @@
 */

// includes, system
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include <builtin_types.h>
#include <cuda.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
@@ -65,8 +65,7 @@ float angle = 0.5f; // angle to rotate image by (in radians)
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

static CUresult initCUDA(int argc, char **argv, CUfunction *);

@@ -84,7 +83,8 @@ CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;

void showHelp()
{
    printf("\n> [%s] Command line options\n", sSDKsample);
    printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
}
@@ -92,7 +92,8 @@ void showHelp() {
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        showHelp();
        return 0;
@@ -104,7 +105,8 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResults = true;

    // initialize CUDA
@@ -191,18 +193,17 @@ void runTest(int argc, char **argv) {
        // Launching (simpler method)
        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};

        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int offset = 0;
@@ -222,29 +223,43 @@ void runTest(int argc, char **argv) {
        *((CUtexObject *)&argBuffer[offset]) = TexObject;
        offset += sizeof(TexObject);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call (warmup)
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       NULL,
                                       NULL,
                                       (void **)&kernel_launch_config));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       0,
                                       NULL,
                                       (void **)&kernel_launch_config));
    }

    checkCudaErrors(cuCtxSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
@@ -262,17 +277,16 @@ void runTest(int argc, char **argv) {
    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk, because it is inverted upon output
        sdkLoadPGM(output_filename, &h_odata, &width, &height);

        printf("Comparing files\n");
        printf("\toutput: <%s>\n", output_filename);
        printf("\treference: <%s>\n", ref_path);
        bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
    }

    // cleanup memory
@@ -293,7 +307,8 @@ void runTest(int argc, char **argv) {
//! kernel function. After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction"
////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
    CUfunction cuFunction = 0;
    int major = 0, minor = 0, devID = 0;
    char deviceName[100];
@@ -302,10 +317,8 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

@@ -316,7 +329,8 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

@@ -328,8 +342,7 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));

    *transform = cuFunction;
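The "advanced" launch path above serializes kernel arguments into a byte buffer handed to cuLaunchKernel through CU_LAUNCH_PARAM_BUFFER_POINTER. A sketch of just the packing arithmetic, under the assumption that each parameter sits at its natural alignment; packArg is a hypothetical helper and no kernel is launched here.

#include <cstdio>
#include <cstring>

// Align `offset` up to T's alignment before storing the value, mirroring
// what the driver expects for each serialized kernel parameter.
template <class T> int packArg(char *buf, int offset, T value)
{
    int align = (int)alignof(T);
    offset = (offset + align - 1) & ~(align - 1);
    memcpy(buf + offset, &value, sizeof(T));
    return offset + (int)sizeof(T);
}

int main()
{
    char argBuffer[256];
    int offset = 0;

    float *d_data = nullptr; // a device pointer would come from cuMemAlloc
    int width = 512, height = 512;
    float angle = 0.5f;

    offset = packArg(argBuffer, offset, d_data);
    offset = packArg(argBuffer, offset, width);
    offset = packArg(argBuffer, offset, height);
    offset = packArg(argBuffer, offset, angle);

    // The real code then builds:
    //   void *cfg[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
    //                  CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
    //                  CU_LAUNCH_PARAM_END};
    // and passes cfg as the `extra` argument of cuLaunchKernel.
    printf("packed %d bytes of kernel parameters\n", offset);
    return 0;
}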
@@ -33,9 +33,8 @@
//! Transform an image using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
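The kernel above rotates each output pixel's normalized coordinates by theta about the image center before the texture fetch. A plain-CPU sketch of that coordinate math, with made-up values.

#include <math.h>
#include <stdio.h>

int main()
{
    const int width = 4, height = 4;
    const float theta = 0.5f; // radians

    unsigned int x = 1, y = 2; // a sample output pixel

    // normalized coordinates, shifted so the image center is (0, 0)
    float u = x / (float)width - 0.5f;
    float v = y / (float)height - 0.5f;

    // rotate about the center, then shift back into [0, 1]
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    printf("pixel (%u,%u) samples the texture at (%f, %f)\n", x, y, tu, tv);
    return 0;
}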
@@ -53,7 +53,8 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh"

// Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    // For testing VOTE.Any (all of these threads will return 0)
    for (int i = 0; i < size / 4; i++) {
        VOTE_PATTERN[i] = 0x00000000;
@@ -75,8 +76,8 @@ void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) {
    }
}

int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
@@ -96,8 +97,8 @@ int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
    return (sum > 0);
}

int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
@@ -118,49 +119,42 @@ int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
}

// Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");

    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");

    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size)
{
    int i, error_count = 0;

    for (i = 0; i < size * 3; i++) {
@@ -198,7 +192,8 @@ int checkResultsVoteAnyKernel3(bool *hinfo, int size) {
    return error_count;
}

int main(int argc, char **argv)
{
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;

@@ -216,24 +211,20 @@ int main(int argc, char **argv) {
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount,
           deviceProp.major,
           deviceProp.minor);

    h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
    checkCudaErrors(
        cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Start of Vote Any Test Kernel #1
    printf("[VOTE Kernel Test 1/3]\n");
@@ -242,16 +233,13 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAnyKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

    // Start of Vote All Test Kernel #2
    printf("\n[VOTE Kernel Test 2/3]\n");
@@ -260,23 +248,18 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAllKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

    // Second Vote Kernel Test #3 (both Any/All)
    hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
    cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
    cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);

    printf("\n[VOTE Kernel Test 3/3]\n");
    printf("\tRunning <<Vote.Any>> kernel3 ...\n");
@@ -286,8 +269,7 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaDeviceSynchronize());
    }

    cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);

    error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);

@@ -303,7 +285,5 @@ int main(int argc, char **argv) {

    printf("\tShutting down...\n");

    return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
@@ -38,8 +38,8 @@
// If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a
// non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

    int mask = 0xffffffff;

@@ -50,8 +50,8 @@ __global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result,
// If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero
// value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

    int mask = 0xffffffff;

@@ -60,7 +60,8 @@ __global__ void VoteAllKernel2(unsigned int *input, unsigned int *result,

// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);
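For reference, a minimal sketch of the warp-vote intrinsics these kernels exercise (assuming the sync variants available on compute capability 7.0+; the kernel name and buffer layout here are illustrative, not part of the sample):

__global__ void voteSketch(const unsigned int *input, unsigned int *result)
{
    unsigned int mask = 0xffffffff; // all 32 lanes of the warp participate
    int          tx   = threadIdx.x;

    // __any_sync: every lane sees non-zero if at least one lane's predicate is non-zero
    result[tx] = __any_sync(mask, input[tx]);
    // __all_sync: every lane sees non-zero only if all lanes' predicates are non-zero
    result[tx + warpSize] = __all_sync(mask, input[tx]);
}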
@@ -41,7 +41,8 @@
#endif

/* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {

@@ -57,7 +58,8 @@ bool bPinGenericMemory = false;
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))

int main(int argc, char **argv)
{
    int n, nelem, deviceCount;
    int idev = 0; // use default device 0
    char *device = NULL;

@@ -73,8 +75,7 @@ int main(int argc, char **argv) {
        printf("Usage: simpleZeroCopy [OPTION]\n\n");
        printf("Options:\n");
        printf(" --device=[device #] Specify the device to be used\n");
        printf(" --use_generic_memory (optional) use generic page-aligned for system "
               "memory\n");
        return EXIT_SUCCESS;
    }

@@ -85,9 +86,7 @@ int main(int argc, char **argv) {
        idev = atoi(device);

        if (idev >= deviceCount || idev < 0) {
            fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
            idev = 0;
        }
    }

@@ -108,7 +107,8 @@ int main(int argc, char **argv) {

    if (bPinGenericMemory) {
        printf("> Using Generic System Paged Memory (malloc)\n");
    }
    else {
        printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
    }

@@ -122,8 +122,7 @@ int main(int argc, char **argv) {
#if CUDART_VERSION >= 2020

    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);

        exit(EXIT_SUCCESS);
    }

@@ -133,7 +132,9 @@ int main(int argc, char **argv) {
    fprintf(stderr,
            "CUDART version %d.%d does not support "
            "<cudaDeviceProp.canMapHostMemory> field\n",
            CUDART_VERSION / 1000,
            (CUDART_VERSION % 100) / 10);

    exit(EXIT_SUCCESS);
#endif

@@ -141,10 +142,10 @@ int main(int argc, char **argv) {
#if CUDART_VERSION < 4000

    if (bPinGenericMemory) {
        fprintf(stderr,
                "CUDART version %d.%d does not support <cudaHostRegister> function\n",
                CUDART_VERSION / 1000,
                (CUDART_VERSION % 100) / 10);

        exit(EXIT_SUCCESS);
    }

@@ -172,7 +173,8 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif
    }
    else {
#if CUDART_VERSION >= 2020
        flags = cudaHostAllocMapped;
        checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));

@@ -235,7 +237,8 @@ int main(int argc, char **argv) {
        free(b_UA);
        free(c_UA);
#endif
    }
    else {
#if CUDART_VERSION >= 2020
        checkCudaErrors(cudaFreeHost(a));
        checkCudaErrors(cudaFreeHost(b));
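A condensed sketch of the zero-copy pattern the two branches above select between (hedged: the real sample's error handling and CUDART version guards are omitted, and grid/block are assumed to be set up as usual):

    // Generic path: over-allocate, align to a page boundary, then pin and map.
    a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    a    = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
    checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));

    // Either path ends the same way: the kernel reads the host buffer through
    // a device alias obtained from cudaHostGetDevicePointer.
    float *d_a;
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
    vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);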
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)
@@ -29,19 +29,20 @@
 * memory.
 */

#include <cstdio>
#include <ctime>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>

#define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b)

#define LOOP_NUM 50

__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    for (int i = 0; i < LOOP_NUM; i++) {

@@ -79,7 +80,8 @@ __global__ void atomicKernel(int *atom_arr) {
    }
}

void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
        for (int j = 0; j < LOOP_NUM; j++) {
            // Atomic addition

@@ -92,23 +94,20 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
            int old, expected;
            do {
                expected = atom_arr[2];
                old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
            } while (old != expected);

            // Atomic minimum
            do {
                expected = atom_arr[3];
                old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
            } while (old != expected);

            // Atomic increment (modulo 17+1)
            int limit = 17;
            do {
                expected = atom_arr[4];
                old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
            } while (old != expected);

            // Atomic decrement

@@ -116,8 +115,7 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
            do {
                expected = atom_arr[5];
                old = __sync_val_compare_and_swap(
                    &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
            } while (old != expected);

            // Atomic compare-and-swap

@@ -145,7 +143,8 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata  input data as provided to device
//! @param len    number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len)
{
    int val = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {

@@ -275,7 +274,8 @@ int verify(int *testData, const int len) {
    return true;
}

int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int dev_id = findCudaDevice(argc, (const char **)argv);

@@ -296,8 +296,7 @@ int main(int argc, char **argv) {
    }

    if (device_prop.major < 6) {
        printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
               "testing.\n",
               argv[0]);
        exit(EXIT_WAIVED);

@@ -312,12 +311,14 @@ int main(int argc, char **argv) {
    if (device_prop.pageableMemoryAccess) {
        printf("CAN access pageable memory\n");
        atom_arr = (int *)malloc(sizeof(int) * numData);
    }
    else {
        printf("CANNOT access pageable memory\n");
        checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
    }

    for (unsigned int i = 0; i < numData; i++)
        atom_arr[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    atom_arr[7] = atom_arr[9] = 0xff;

@@ -332,11 +333,11 @@ int main(int argc, char **argv) {

    if (device_prop.pageableMemoryAccess) {
        free(atom_arr);
    }
    else {
        cudaFree(atom_arr);
    }

    printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
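For contrast with the CPU-side __sync_val_compare_and_swap loops above, the GPU half of this test can express the same operations directly with system-scoped atomics (a sketch, assuming compute capability 6.0+ and memory visible to both processors, e.g. from cudaMallocManaged; the kernel name is illustrative):

__global__ void systemAtomicsSketch(int *atom_arr, int val)
{
    // _system-scoped atomics are atomic with respect to the CPU and other
    // GPUs in the system, not just threads on this device.
    atomicMax_system(&atom_arr[2], val);
    atomicMin_system(&atom_arr[3], val);
}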
@@ -31,10 +31,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

@@ -47,15 +47,15 @@
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata)
{
    // shared memory
    // the size is determined by the host application
    extern __shared__ float sdata[];

@@ -85,7 +85,8 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResult = true;

    printf("%s Starting...\n\n", argv[0]);

@@ -113,8 +114,7 @@ void runTest(int argc, char **argv) {
    float *d_idata;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // allocate device memory for result
    float *d_odata;

@@ -133,8 +133,7 @@ void runTest(int argc, char **argv) {
    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(mem_size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));

@@ -148,7 +147,8 @@ void runTest(int argc, char **argv) {
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
    }
    else {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
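The size of the extern __shared__ array in testKernel comes from the third launch-configuration argument; a minimal sketch of the matching host-side launch (variable names mirror the sample, the values are illustrative):

    unsigned int num_threads = 32;
    unsigned int mem_size    = sizeof(float) * num_threads;

    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

    // The third <<<>>> parameter is the dynamic shared memory size in bytes;
    // it backs the `extern __shared__ float sdata[]` declaration in the kernel.
    testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);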
@@ -26,8 +26,7 @@
 */

// export C interface
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set

@@ -36,7 +35,8 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata  input data as provided to device
//! @param len    number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len)
{
    const float f_len = static_cast<float>(len);

    for (unsigned int i = 0; i < len; ++i) {
@@ -37,7 +37,6 @@

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>
/**
 * CUDA Kernel Device code

@@ -45,8 +44,8 @@
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {

@@ -57,7 +56,8 @@ __global__ void vectorAdd(const float *A, const float *B, float *C,
/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

@@ -92,8 +92,7 @@ int main(void) {
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -102,8 +101,7 @@ int main(void) {
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -112,8 +110,7 @@ int main(void) {
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -124,32 +121,26 @@ int main(void) {
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -159,9 +150,7 @@ int main(void) {
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -179,24 +168,21 @@ int main(void) {
    err = cudaFree(d_A);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
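The (numElements + threadsPerBlock - 1) / threadsPerBlock expression above is integer ceiling division, so the grid always covers the whole vector and the i < numElements guard in the kernel absorbs the overshoot. A worked instance with the hypothetical size of 50000 elements:

    // blocksPerGrid = (50000 + 255) / 256 = 196 blocks -> 196 * 256 = 50176 threads;
    // the last 176 threads fail the i < numElements test and simply return.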
@@ -34,11 +34,11 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda_drvapi.h>

@@ -72,7 +72,8 @@ bool findModulePath(const char *, string &, char **, string &);
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int N = 50000, devID = 0;
    size_t size = N * sizeof(float);

@@ -91,7 +92,8 @@ int main(int argc, char **argv) {

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

@@ -104,8 +106,7 @@ int main(int argc, char **argv) {
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float *)malloc(size);

@@ -139,9 +140,9 @@ int main(int argc, char **argv) {
        void *args[] = {&d_A, &d_B, &d_C, &N};

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
        // Launch (advanced method)
        int offset = 0;

@@ -160,9 +161,8 @@ int main(int argc, char **argv) {
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        checkCudaErrors(
            cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
    }

#ifdef _DEBUG

@@ -190,7 +190,8 @@ int main(int argc, char **argv) {
    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

int CleanupNoFailure()
{
    // Free device memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));

@@ -214,7 +215,8 @@ int CleanupNoFailure() {
    return EXIT_SUCCESS;
}
// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
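The two launch paths above differ only in how kernel arguments travel: the simple path hands cuLaunchKernel an array of pointers to each argument (args), while the advanced path packs the raw bytes into one buffer described through the extra parameter. A hedged sketch of the packed form (offsets and alignment handling simplified; the launch shape variables are assumed set up as in the sample):

    char   paramBuffer[256];
    size_t paramSize = 0;

    // Append each argument at its naturally aligned offset (simplified here:
    // 8-byte CUdeviceptrs first, then the 4-byte int, so all offsets align).
    *(CUdeviceptr *)(paramBuffer + paramSize) = d_A; paramSize += sizeof(d_A);
    *(CUdeviceptr *)(paramBuffer + paramSize) = d_B; paramSize += sizeof(d_B);
    *(CUdeviceptr *)(paramBuffer + paramSize) = d_C; paramSize += sizeof(d_C);
    *(int *)(paramBuffer + paramSize)         = N;   paramSize += sizeof(N);

    void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,    &paramSize,
                     CU_LAUNCH_PARAM_END};
    checkCudaErrors(
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, extra));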
@@ -33,9 +33,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}
@@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)
|||||||
@ -29,10 +29,13 @@
|
|||||||
|
|
||||||
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
|
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
|
||||||
|
|
||||||
CUresult simpleMallocMultiDeviceMmap(
|
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
|
||||||
CUdeviceptr *dptr, size_t *allocationSize, size_t size,
|
size_t *allocationSize,
|
||||||
|
size_t size,
|
||||||
const std::vector<CUdevice> &residentDevices,
|
const std::vector<CUdevice> &residentDevices,
|
||||||
const std::vector<CUdevice> &mappingDevices, size_t align) {
|
const std::vector<CUdevice> &mappingDevices,
|
||||||
|
size_t align)
|
||||||
|
{
|
||||||
CUresult status = CUDA_SUCCESS;
|
CUresult status = CUDA_SUCCESS;
|
||||||
size_t min_granularity = 0;
|
size_t min_granularity = 0;
|
||||||
size_t stripeSize;
|
size_t stripeSize;
|
||||||
@ -53,8 +56,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
|
|
||||||
// get the minnimum granularity for residentDevices[idx]
|
// get the minnimum granularity for residentDevices[idx]
|
||||||
prop.location.id = residentDevices[idx];
|
prop.location.id = residentDevices[idx];
|
||||||
status = cuMemGetAllocationGranularity(&granularity, &prop,
|
status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
||||||
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
|
||||||
if (status != CUDA_SUCCESS) {
|
if (status != CUDA_SUCCESS) {
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@ -70,8 +72,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
|
|
||||||
// get the minnimum granularity for mappingDevices[idx]
|
// get the minnimum granularity for mappingDevices[idx]
|
||||||
prop.location.id = mappingDevices[idx];
|
prop.location.id = mappingDevices[idx];
|
||||||
status = cuMemGetAllocationGranularity(&granularity, &prop,
|
status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
||||||
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
|
||||||
if (status != CUDA_SUCCESS) {
|
if (status != CUDA_SUCCESS) {
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@ -121,8 +122,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
// Since we do not need to make any other mappings of this memory or export
|
// Since we do not need to make any other mappings of this memory or export
|
||||||
// it, we no longer need and can release the allocationHandle. The
|
// it, we no longer need and can release the allocationHandle. The
|
||||||
// allocation will be kept live until it is unmapped.
|
// allocation will be kept live until it is unmapped.
|
||||||
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
|
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
|
||||||
allocationHandle, 0);
|
|
||||||
|
|
||||||
// the handle needs to be released even if the mapping failed.
|
// the handle needs to be released even if the mapping failed.
|
||||||
status2 = cuMemRelease(allocationHandle);
|
status2 = cuMemRelease(allocationHandle);
|
||||||
@ -157,8 +157,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Apply the access descriptors to the whole VA range.
|
// Apply the access descriptors to the whole VA range.
|
||||||
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0],
|
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
|
||||||
accessDescriptors.size());
|
|
||||||
if (status != CUDA_SUCCESS) {
|
if (status != CUDA_SUCCESS) {
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@ -174,7 +173,8 @@ done:
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) {
|
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
|
||||||
|
{
|
||||||
CUresult status = CUDA_SUCCESS;
|
CUresult status = CUDA_SUCCESS;
|
||||||
|
|
||||||
// Unmap the mapped virtual memory region
|
// Unmap the mapped virtual memory region
|
||||||
|
|||||||
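round_up exists because cuMemCreate and cuMemMap only accept sizes that are multiples of the reported allocation granularity, so each per-device stripe and the total VA reservation are padded up to it. A worked instance (hypothetical numbers):

    // Suppose min_granularity = 2 MiB, two resident devices, size = 5 MiB:
    //   stripeSize = round_up(5 MiB / 2, 2 MiB) = round_up(2.5 MiB, 2 MiB) = 4 MiB
    // so 2 stripes * 4 MiB = 8 MiB of VA space back the 5 MiB request.
    size_t stripeSize  = round_up(size / residentDevices.size(), min_granularity);
    size_t reserveSize = stripeSize * residentDevices.size();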
@@ -63,10 +63,12 @@
//! handle
//! is not needed after its mappings are set up.
////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
                                     size_t *allocationSize,
                                     size_t size,
                                     const std::vector<CUdevice> &residentDevices,
                                     const std::vector<CUdevice> &mappingDevices,
                                     size_t align = 0);

////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap
|||||||
@ -36,11 +36,11 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Includes
|
// Includes
|
||||||
|
#include <cstring>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
|
#include <iostream>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
// includes, project
|
// includes, project
|
||||||
#include <helper_cuda_drvapi.h>
|
#include <helper_cuda_drvapi.h>
|
||||||
@ -76,7 +76,8 @@ void RandomInit(float *, int);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// collect all of the devices whose memory can be mapped from cuDevice.
|
// collect all of the devices whose memory can be mapped from cuDevice.
|
||||||
vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
|
vector<CUdevice> getBackingDevices(CUdevice cuDevice)
|
||||||
|
{
|
||||||
int num_devices;
|
int num_devices;
|
||||||
|
|
||||||
checkCudaErrors(cuDeviceGetCount(&num_devices));
|
checkCudaErrors(cuDeviceGetCount(&num_devices));
|
||||||
@ -100,9 +101,8 @@ vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
|
|||||||
|
|
||||||
// The device needs to support virtual address management for the required
|
// The device needs to support virtual address management for the required
|
||||||
// apis to work
|
// apis to work
|
||||||
checkCudaErrors(cuDeviceGetAttribute(
|
checkCudaErrors(
|
||||||
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
|
cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
|
||||||
cuDevice));
|
|
||||||
if (attributeVal == 0) {
|
if (attributeVal == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -113,7 +113,8 @@ vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Host code
|
// Host code
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
printf("Vector Addition (Driver API)\n");
|
printf("Vector Addition (Driver API)\n");
|
||||||
int N = 50000;
|
int N = 50000;
|
||||||
size_t size = N * sizeof(float);
|
size_t size = N * sizeof(float);
|
||||||
@ -125,11 +126,9 @@ int main(int argc, char **argv) {
|
|||||||
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
|
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
|
||||||
|
|
||||||
// Check that the selected device supports virtual address management
|
// Check that the selected device supports virtual address management
|
||||||
checkCudaErrors(cuDeviceGetAttribute(
|
checkCudaErrors(
|
||||||
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
|
cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
|
||||||
cuDevice));
|
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
|
||||||
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice,
|
|
||||||
attributeVal);
|
|
||||||
if (attributeVal == 0) {
|
if (attributeVal == 0) {
|
||||||
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
|
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
@ -152,17 +151,14 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
std::ostringstream fatbin;
|
std::ostringstream fatbin;
|
||||||
|
|
||||||
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
|
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
|
||||||
{
|
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!fatbin.str().size())
|
if (!fatbin.str().size()) {
|
||||||
{
|
|
||||||
printf("fatbin file empty. exiting..\n");
|
printf("fatbin file empty. exiting..\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
@ -207,10 +203,7 @@ int main(int argc, char **argv) {
|
|||||||
void *args[] = {&d_A, &d_B, &d_C, &N};
|
void *args[] = {&d_A, &d_B, &d_C, &N};
|
||||||
|
|
||||||
// Launch the CUDA kernel
|
// Launch the CUDA kernel
|
||||||
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
|
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
|
||||||
threadsPerBlock, 1, 1,
|
|
||||||
0,
|
|
||||||
NULL, args, NULL));
|
|
||||||
|
|
||||||
// Copy result from device memory to host memory
|
// Copy result from device memory to host memory
|
||||||
// h_C contains the result in host memory
|
// h_C contains the result in host memory
|
||||||
@ -219,12 +212,10 @@ int main(int argc, char **argv) {
|
|||||||
// Verify result
|
// Verify result
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < N; ++i)
|
for (i = 0; i < N; ++i) {
|
||||||
{
|
|
||||||
float sum = h_A[i] + h_B[i];
|
float sum = h_A[i] + h_B[i];
|
||||||
|
|
||||||
if (fabs(h_C[i] - sum) > 1e-7f)
|
if (fabs(h_C[i] - sum) > 1e-7f) {
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -243,18 +234,15 @@ int CleanupNoFailure()
|
|||||||
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
|
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
|
||||||
|
|
||||||
// Free host memory
|
// Free host memory
|
||||||
if (h_A)
|
if (h_A) {
|
||||||
{
|
|
||||||
free(h_A);
|
free(h_A);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (h_B)
|
if (h_B) {
|
||||||
{
|
|
||||||
free(h_B);
|
free(h_B);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (h_C)
|
if (h_C) {
|
||||||
{
|
|
||||||
free(h_C);
|
free(h_C);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,8 +253,7 @@ int CleanupNoFailure()
|
|||||||
// Allocates an array with random float entries.
|
// Allocates an array with random float entries.
|
||||||
void RandomInit(float *data, int n)
|
void RandomInit(float *data, int n)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < n; ++i)
|
for (int i = 0; i < n; ++i) {
|
||||||
{
|
|
||||||
data[i] = rand() / (float)RAND_MAX;
|
data[i] = rand() / (float)RAND_MAX;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,9 +34,10 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Device code
|
// Device code
|
||||||
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
|
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
|
||||||
float *C, int N) {
|
{
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if (i < N) C[i] = A[i] + B[i];
|
if (i < N)
|
||||||
|
C[i] = A[i] + B[i];
|
||||||
}
|
}
|
||||||
|
|||||||
@ -33,8 +33,8 @@
|
|||||||
* of the programming guide with some additions like error checking.
|
* of the programming guide with some additions like error checking.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
// For the CUDA runtime routines (prefixed with "cuda_")
|
// For the CUDA runtime routines (prefixed with "cuda_")
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
@ -42,13 +42,13 @@
|
|||||||
|
|
||||||
// helper functions and utilities to work with CUDA
|
// helper functions and utilities to work with CUDA
|
||||||
#include <helper_functions.h>
|
#include <helper_functions.h>
|
||||||
|
|
||||||
#include <nvrtc_helper.h>
|
#include <nvrtc_helper.h>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Host main routine
|
* Host main routine
|
||||||
*/
|
*/
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
char *cubin, *kernel_file;
|
char *cubin, *kernel_file;
|
||||||
size_t cubinSize;
|
size_t cubinSize;
|
||||||
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
|
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
|
||||||
@ -105,19 +105,23 @@ int main(int argc, char **argv) {
|
|||||||
// Launch the Vector Add CUDA Kernel
|
// Launch the Vector Add CUDA Kernel
|
||||||
int threadsPerBlock = 256;
|
int threadsPerBlock = 256;
|
||||||
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
|
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
|
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
|
||||||
threadsPerBlock);
|
|
||||||
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
|
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
|
||||||
dim3 cudaGridSize(blocksPerGrid, 1, 1);
|
dim3 cudaGridSize(blocksPerGrid, 1, 1);
|
||||||
|
|
||||||
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
|
void *arr[] = {reinterpret_cast<void *>(&d_A),
|
||||||
|
reinterpret_cast<void *>(&d_B),
|
||||||
reinterpret_cast<void *>(&d_C),
|
reinterpret_cast<void *>(&d_C),
|
||||||
reinterpret_cast<void *>(&numElements)};
|
reinterpret_cast<void *>(&numElements)};
|
||||||
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
|
checkCudaErrors(cuLaunchKernel(kernel_addr,
|
||||||
|
cudaGridSize.x,
|
||||||
|
cudaGridSize.y,
|
||||||
cudaGridSize.z, /* grid dim */
|
cudaGridSize.z, /* grid dim */
|
||||||
cudaBlockSize.x, cudaBlockSize.y,
|
cudaBlockSize.x,
|
||||||
|
cudaBlockSize.y,
|
||||||
cudaBlockSize.z, /* block dim */
|
cudaBlockSize.z, /* block dim */
|
||||||
0, 0, /* shared mem, stream */
|
0,
|
||||||
|
0, /* shared mem, stream */
|
||||||
&arr[0], /* arguments */
|
&arr[0], /* arguments */
|
||||||
0));
|
0));
|
||||||
checkCudaErrors(cuCtxSynchronize());
|
checkCudaErrors(cuCtxSynchronize());
|
||||||
|
|||||||
@ -32,8 +32,8 @@
|
|||||||
* number of elements numElements.
|
* number of elements numElements.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
|
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
|
||||||
int numElements) {
|
{
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if (i < numElements) {
|
if (i < numElements) {
|
||||||
|
|||||||
@@ -39,12 +39,10 @@
#include <cuda_runtime.h>

// includes
#include <cassert>
#include <cuda.h>
#include <helper_cuda.h>      // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

#include <iostream>
#include <memory>

@@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };

const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL};

const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};

@@ -97,36 +94,62 @@ char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start,
                   unsigned int end,
                   unsigned int increment,
                   testMode mode,
                   memcpyKind kind,
                   printMode printmode,
                   memoryMode memMode,
                   int startDevice,
                   int endDevice,
                   bool wc);
void testBandwidthQuick(unsigned int size,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);
void testBandwidthRange(unsigned int start,
                        unsigned int end,
                        unsigned int increment,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);
void testBandwidthShmoo(memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes,
                          double *bandwidths,
                          unsigned int count,
                          memcpyKind kind,
                          memoryMode memMode,
                          int iNumDevs,
                          bool wc);
void printResultsCSV(unsigned int *memSizes,
                     double *bandwidths,
                     unsigned int count,
                     memcpyKind kind,
                     memoryMode memMode,
                     int iNumDevs,
                     bool wc);
void printHelp(void);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
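The PINNED/PAGEABLE modes declared above correspond to two different host allocations; pinned (page-locked) buffers can be DMA'd directly by the copy engine and usually measure higher bandwidth. A hedged sketch of the two allocation paths (simplified from the test's helpers; memSize and wc stand in for the values parsed from the command line):

    unsigned char *h_buf = NULL;

    if (memMode == PINNED) {
        // Page-locked; optionally write-combined, which favors host-to-device streams.
        checkCudaErrors(cudaHostAlloc((void **)&h_buf, memSize, wc ? cudaHostAllocWriteCombined : 0));
    }
    else {
        // Pageable; cudaMemcpy must stage it through an internal pinned buffer.
        h_buf = (unsigned char *)malloc(memSize);
    }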
@ -144,8 +167,7 @@ int main(int argc, char **argv) {
|
|||||||
// finish
|
// finish
|
||||||
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
|
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
|
||||||
|
|
||||||
printf(
|
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||||
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
|
||||||
"Results may vary when GPU Boost is enabled.\n");
|
"Results may vary when GPU Boost is enabled.\n");
|
||||||
|
|
||||||
free(flush_buf);
|
free(flush_buf);
|
||||||
@ -156,7 +178,8 @@ int main(int argc, char **argv) {
|
|||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
// Parse args, run the appropriate tests
|
// Parse args, run the appropriate tests
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
int runTest(const int argc, const char **argv) {
|
int runTest(const int argc, const char **argv)
|
||||||
|
{
|
||||||
int start = DEFAULT_SIZE;
|
int start = DEFAULT_SIZE;
|
||||||
int end = DEFAULT_SIZE;
|
int end = DEFAULT_SIZE;
|
||||||
int startDevice = 0;
|
int startDevice = 0;
|
||||||
@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) {
|
|||||||
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
|
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
|
||||||
if (strcmp(memModeStr, "pageable") == 0) {
|
if (strcmp(memModeStr, "pageable") == 0) {
|
||||||
memMode = PAGEABLE;
|
memMode = PAGEABLE;
|
||||||
} else if (strcmp(memModeStr, "pinned") == 0) {
|
}
|
||||||
|
else if (strcmp(memModeStr, "pinned") == 0) {
|
||||||
memMode = PINNED;
|
memMode = PINNED;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
printf("Invalid memory mode - valid modes are pageable or pinned\n");
|
printf("Invalid memory mode - valid modes are pageable or pinned\n");
|
||||||
printf("See --help for more information\n");
|
printf("See --help for more information\n");
|
||||||
return -1000;
|
return -1000;
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// default - pinned memory
|
// default - pinned memory
|
||||||
memMode = PINNED;
|
memMode = PINNED;
|
||||||
}
|
}
|
||||||
@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) {
|
|||||||
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||||
|
|
||||||
if (error_id != cudaSuccess) {
|
if (error_id != cudaSuccess) {
|
||||||
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
|
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
|
||||||
cudaGetErrorString(error_id));
|
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) {
    }

    if (strcmp(device, "all") == 0) {
        printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices "
               "!!!!!!\n\n");
        startDevice = 0;
        endDevice = deviceCount - 1;
    }
    else {
        startDevice = endDevice = atoi(device);

        if (startDevice >= deviceCount || startDevice < 0) {
            printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
                   "used !!!!!\n",
                   startDevice,
                   0);
            startDevice = endDevice = 0;
        }
    }
@@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) {

    printf("Running on...\n\n");

    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
        cudaDeviceProp deviceProp;
        cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);

@@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) {

                exit(EXIT_FAILURE);
            }
        }
        else {
            printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
            checkCudaErrors(cudaSetDevice(currentDevice));

            exit(EXIT_FAILURE);
@@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) {
        if (strcmp(modeStr, "quick") == 0) {
            printf(" Quick Mode\n\n");
            mode = QUICK_MODE;
        }
        else if (strcmp(modeStr, "shmoo") == 0) {
            printf(" Shmoo Mode\n\n");
            mode = SHMOO_MODE;
        }
        else if (strcmp(modeStr, "range") == 0) {
            printf(" Range Mode\n\n");
            mode = RANGE_MODE;
        }
        else {
            printf("Invalid mode - valid modes are quick, range, or shmoo\n");
            printf("See --help for more information\n");
            return -3000;
        }
    }
    else {
        // default mode - quick
        printf(" Quick Mode\n\n");
        mode = QUICK_MODE;
@@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) {
            printf("Illegal argument - start must be greater than zero\n");
            return -4000;
        }
    }
    else {
        printf("Must specify a starting size in range mode\n");
        printf("See --help for more information\n");
        return -5000;
@@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) {
            printf("Illegal argument - start is greater than end\n");
            return -7000;
        }
    }
    else {
        printf("Must specify an end size in range mode.\n");
        printf("See --help for more information\n");
        return -8000;
@@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) {
            printf("Illegal argument - increment must be greater than zero\n");
            return -9000;
        }
    }
    else {
        printf("Must specify an increment in user mode\n");
        printf("See --help for more information\n");
        return -10000;
@@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) {
    }

    if (htod) {
        testBandwidth((unsigned int)start,
                      (unsigned int)end,
                      (unsigned int)increment,
                      mode,
                      HOST_TO_DEVICE,
                      printmode,
                      memMode,
                      startDevice,
                      endDevice,
                      wc);
    }

    if (dtoh) {
        testBandwidth((unsigned int)start,
                      (unsigned int)end,
                      (unsigned int)increment,
                      mode,
                      DEVICE_TO_HOST,
                      printmode,
                      memMode,
                      startDevice,
                      endDevice,
                      wc);
    }

    if (dtod) {
        testBandwidth((unsigned int)start,
                      (unsigned int)end,
                      (unsigned int)increment,
                      mode,
                      DEVICE_TO_DEVICE,
                      printmode,
                      memMode,
                      startDevice,
                      endDevice,
                      wc);
    }

    // Ensure that we reset all CUDA Devices in question
@@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) {
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start,
                   unsigned int end,
                   unsigned int increment,
                   testMode mode,
                   memcpyKind kind,
                   printMode printmode,
                   memoryMode memMode,
                   int startDevice,
                   int endDevice,
                   bool wc)
{
    switch (mode) {
    case QUICK_MODE:
        testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
        break;

    case RANGE_MODE:
        testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
        break;

    case SHMOO_MODE:
@@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthQuick(unsigned int size,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
}

///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start,
                        unsigned int end,
                        unsigned int increment,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    // count the number of copies we're going to run
    unsigned int count = 1 + ((end - start) / increment);

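As a worked instance of the count formula: with the range from the sample's --help text further down (start=1024, end=102400, increment=1024), the test visits

    unsigned int exampleCount = 1 + ((102400u - 1024u) / 1024u); // = 100 transfer sizes
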
@@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end,
    }

    // Use the device asked by the user
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
        cudaSetDevice(currentDevice);

        // run each of the copies
@@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end,

    // print results
    if (printmode == CSV) {
        printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }
    else {
        printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }

    // clean up
@@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end,
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    // count the number of copies to make
    unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
                       + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
                       + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
                       + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
                       + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
                       + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
                       + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);

    unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
    double *bandwidths = (double *)malloc(count * sizeof(double));
@@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
    }

    // Use the device asked by the user
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
        cudaSetDevice(currentDevice);
        // Run the shmoo
        int iteration = 0;
@@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
        while (memSize <= SHMOO_MEMSIZE_MAX) {
            if (memSize < SHMOO_LIMIT_20KB) {
                memSize += SHMOO_INCREMENT_1KB;
            }
            else if (memSize < SHMOO_LIMIT_50KB) {
                memSize += SHMOO_INCREMENT_2KB;
            }
            else if (memSize < SHMOO_LIMIT_100KB) {
                memSize += SHMOO_INCREMENT_10KB;
            }
            else if (memSize < SHMOO_LIMIT_1MB) {
                memSize += SHMOO_INCREMENT_100KB;
            }
            else if (memSize < SHMOO_LIMIT_16MB) {
                memSize += SHMOO_INCREMENT_1MB;
            }
            else if (memSize < SHMOO_LIMIT_32MB) {
                memSize += SHMOO_INCREMENT_2MB;
            }
            else {
                memSize += SHMOO_INCREMENT_4MB;
            }

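A self-contained sketch of the same bucketed walk, handy for checking how many sizes the shmoo visits. The SHMOO_LIMIT_* names above suggest the limits used here, but the numeric values and the 1 KiB starting size are assumptions of the sketch, not the sample's definitions:

    #include <cstdio>

    // Bucketed increment schedule: small sizes step finely, large sizes coarsely.
    static unsigned int shmooVisitCount(unsigned int maxSize)
    {
        unsigned int n = 0;
        unsigned int sz = 1024; // assumed starting size
        while (sz <= maxSize) {
            ++n;
            if (sz < (20u << 10))        sz += 1u << 10;   // 1 KiB steps below 20 KiB
            else if (sz < (50u << 10))   sz += 2u << 10;   // 2 KiB steps below 50 KiB
            else if (sz < (100u << 10))  sz += 10u << 10;  // 10 KiB steps below 100 KiB
            else if (sz < (1u << 20))    sz += 100u << 10; // 100 KiB steps below 1 MiB
            else if (sz < (16u << 20))   sz += 1u << 20;   // 1 MiB steps below 16 MiB
            else if (sz < (32u << 20))   sz += 2u << 20;   // 2 MiB steps below 32 MiB
            else                         sz += 4u << 20;   // 4 MiB steps above
        }
        return n;
    }

    int main(void)
    {
        printf("%u transfer sizes up to 64 MiB\n", shmooVisitCount(64u << 20));
        return 0;
    }
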
@@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,

            switch (kind) {
            case DEVICE_TO_HOST:
                bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
                break;

            case HOST_TO_DEVICE:
                bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
                break;

            case DEVICE_TO_DEVICE:
                bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
                break;
            }

@@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
    printf("\n");

    if (CSV == printmode) {
        printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }
    else {
        printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }

    // clean up
@@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInGBs = 0.0f;
@@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
    if (PINNED == memMode) {
        // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
        checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
        checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
        checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
        checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
    }
    else {
        // pageable memory mode - use malloc
        h_idata = (unsigned char *)malloc(memSize);
        h_odata = (unsigned char *)malloc(memSize);
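The cudaHostAllocWriteCombined flag used above trades CPU read speed for faster bus transfers: write-combined pages bypass the CPU caches, so the host should fill them sequentially and avoid reading them back in a hot path. A minimal allocation pairing, assuming a CUDA 2.2+ runtime and the sample's checkCudaErrors helper; the buffer name and size are illustrative:

    unsigned char *wcBuf = NULL;
    checkCudaErrors(cudaHostAlloc((void **)&wcBuf, 1 << 20, cudaHostAllocWriteCombined));
    // ... write wcBuf sequentially on the CPU, then copy it to the device ...
    checkCudaErrors(cudaFreeHost(wcBuf));
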
@@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
    checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

    // initialize the device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

    // copy data from GPU to Host
    if (PINNED == memMode) {
        if (bDontUseGPUTiming)
            sdkStartTimer(&timer);
        checkCudaErrors(cudaEventRecord(start, 0));
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0));
        }
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaDeviceSynchronize());
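The event pair above is the standard CUDA pattern for timing async copies: both records land on stream 0, so the measured interval brackets all MEMCOPY_ITERATIONS transfers. A self-contained sketch of that pattern, with names local to the sketch:

    cudaEvent_t t0, t1;
    checkCudaErrors(cudaEventCreate(&t0));
    checkCudaErrors(cudaEventCreate(&t1));
    checkCudaErrors(cudaEventRecord(t0, 0));            // marker enqueued on stream 0
    // ... enqueue cudaMemcpyAsync calls on stream 0 here ...
    checkCudaErrors(cudaEventRecord(t1, 0));            // marker after the last copy
    checkCudaErrors(cudaEventSynchronize(t1));          // wait for the GPU to reach t1
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, t0, t1)); // GPU-side milliseconds
    checkCudaErrors(cudaEventDestroy(t0));
    checkCudaErrors(cudaEventDestroy(t1));
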
@@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
            elapsedTimeInMs = sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
        }
    }
    else {
        elapsedTimeInMs = 0;
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            sdkStartTimer(&timer);
            checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
            sdkStopTimer(&timer);
            elapsedTimeInMs += sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
@@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
    if (PINNED == memMode) {
        checkCudaErrors(cudaFreeHost(h_idata));
        checkCudaErrors(cudaFreeHost(h_odata));
    }
    else {
        free(h_idata);
        free(h_odata);
    }
@@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInGBs = 0.0f;
@@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
    if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
        // pinned memory mode - use special function to get OS-pinned memory
        checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
        // pinned memory mode - use special function to get OS-pinned memory
        checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
    }
    else {
        // pageable memory mode - use malloc
        h_odata = (unsigned char *)malloc(memSize);

@@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,

    // copy host memory to device memory
    if (PINNED == memMode) {
        if (bDontUseGPUTiming)
            sdkStartTimer(&timer);
        checkCudaErrors(cudaEventRecord(start, 0));
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0));
        }
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaDeviceSynchronize());
@@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
            elapsedTimeInMs = sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
        }
    }
    else {
        elapsedTimeInMs = 0;
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            sdkStartTimer(&timer);
            checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
            sdkStopTimer(&timer);
            elapsedTimeInMs += sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
@@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,

    if (PINNED == memMode) {
        checkCudaErrors(cudaFreeHost(h_odata));
    }
    else {
        free(h_odata);
    }

@@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInGBs = 0.0f;
@@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
    checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));

    // initialize memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

    // run the memcopy
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
        checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
@@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
/////////////////////////////////////////////////////////
// print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes,
                          double *bandwidths,
                          unsigned int count,
                          memcpyKind kind,
                          memoryMode memMode,
                          int iNumDevs,
                          bool wc)
{
    printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
    printf(" %s Memory Transfers\n", sMemoryMode[memMode]);

@@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths,
    unsigned int i;

    for (i = 0; i < (count - 1); i++) {
        printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
    }

    printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}

///////////////////////////////////////////////////////////////////////////
// print results in a database format
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes,
                     double *bandwidths,
                     unsigned int count,
                     memcpyKind kind,
                     memoryMode memMode,
                     int iNumDevs,
                     bool wc)
{
    std::string sConfig;

    // log config information
    if (kind == DEVICE_TO_DEVICE) {
        sConfig += "D2D";
    }
    else {
        if (kind == DEVICE_TO_HOST) {
            sConfig += "D2H";
        }
        else if (kind == HOST_TO_DEVICE) {
            sConfig += "H2D";
        }

        if (memMode == PAGEABLE) {
            sConfig += "-Paged";
        }
        else if (memMode == PINNED) {
            sConfig += "-Pinned";

            if (wc) {
@@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths,

    for (i = 0; i < count; i++) {
        dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
        printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
               "bytes, NumDevsUsed = %d\n",
               sConfig.c_str(),
               bandwidths[i],
               dSeconds,
               memSizes[i],
               iNumDevs);
    }
}

///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void)
{
    printf("Usage: bandwidthTest [OPTION]...\n");
    printf("Test the bandwidth for device to host, host to device, and device to "
           "device transfers\n");
    printf("\n");
    printf("Example: measure the bandwidth of device to host pinned memory copies "
           "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
    printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
           "--increment=1024 --dtoh\n");

    printf("\n");
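The dSeconds line above simply inverts the bandwidth definition used throughout the sample, GB/s = bytes / (1e9 * seconds). As a hypothetical worked example:

    dSeconds = 33554432 / (12.3 * 1e9); // a 32 MiB copy at 12.3 GB/s takes about 0.00273 s
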
@@ -32,7 +32,6 @@

#include <cuda_runtime.h>
#include <helper_cuda.h>

#include <iostream>
#include <memory>
#include <string>
@@ -46,16 +45,13 @@ char **pArgv = NULL;
#include <cuda.h>

// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error) {
        fprintf(
            stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);

        exit(EXIT_FAILURE);
    }
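A typical call through this wrapper, querying the async copy-engine count (the attribute enum comes from cuda.h; dev is assumed to be a valid CUdevice, and the variable name is illustrative):

    int asyncEngineCount = 0;
    getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
    printf("Async engines: %d\n", asyncEngineCount);
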
@@ -66,20 +62,19 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast<int>(error_id), cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
@@ -87,7 +82,8 @@ int main(int argc, char **argv) {
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    }
    else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

@@ -104,20 +100,23 @@ int main(int argc, char **argv) {
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
               driverVersion / 1000,
               (driverVersion % 100) / 10,
               runtimeVersion / 1000,
               (runtimeVersion % 100) / 10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(msg,
                  sizeof(msg),
                  " Total amount of global memory: %.0f MBytes "
                  "(%llu bytes)\n",
                  static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
                  (unsigned long long)deviceProp.totalGlobalMem);
#else
        snprintf(msg,
                 sizeof(msg),
                 " Total amount of global memory: %.0f MBytes "
                 "(%llu bytes)\n",
                 static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
@@ -128,121 +127,100 @@ int main(int argc, char **argv) {
        printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
               "GHz)\n",
               deviceProp.clockRate * 1e-3f,
               deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }

#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the
        // CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);

        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);

        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }

#endif

        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
               "%d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D,
               deviceProp.maxTexture2D[0],
               deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0],
               deviceProp.maxTexture3D[1],
               deviceProp.maxTexture3D[2]);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0],
               deviceProp.maxTexture1DLayered[1]);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
               "layers\n",
               deviceProp.maxTexture2DLayered[0],
               deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);

        printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total shared memory per multiprocessor: %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy "
               "engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"),
               deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device supports Managed Memory: %s\n", deviceProp.managedMemory ? "Yes" : "No");
        printf(" Device supports Compute Preemption: %s\n",
               deviceProp.computePreemptionSupported ? "Yes" : "No");
        printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
               deviceProp.pciDomainID,
               deviceProp.pciBusID,
               deviceProp.pciDeviceID);

        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
@@ -250,7 +228,8 @@ int main(int argc, char **argv) {
                                      "device)",
                                      "Exclusive Process (many threads in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }
@@ -286,10 +265,12 @@ int main(int argc, char **argv) {
            if (gpuid[i] == gpuid[j]) {
                continue;
            }
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   prop[gpuid[i]].name,
                   gpuid[i],
                   prop[gpuid[j]].name,
                   gpuid[j],
                   can_access_peer ? "Yes" : "No");
        }
    }
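cudaDeviceCanAccessPeer only reports capability; to actually map a peer's allocations a program must opt in from the accessing device. A minimal sketch reusing the gpuid[] pair from the loop above, with error handling via the sample's checkCudaErrors:

    checkCudaErrors(cudaSetDevice(gpuid[i]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[j], 0)); // flags must be 0
    // gpuid[i] can now cudaMemcpyPeer() to/from gpuid[j] and dereference its
    // device pointers directly from kernels
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[j]));
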
@@ -306,22 +287,18 @@ int main(int argc, char **argv) {
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;

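Both branches decode the packed version integer the same way; as a hypothetical worked example, a value of 12040 from cudaDriverGetVersion yields:

    int v = 12040;              // hypothetical packed version
    int major = v / 1000;       // 12
    int minor = (v % 100) / 10; // 40 / 10 = 4, i.e. the string "12.4"
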
@@ -30,17 +30,17 @@
*/

// includes, system

#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    CUdevice dev;
    int major = 0, minor = 0;
    int deviceCount = 0;
@@ -58,15 +58,14 @@ int main(int argc, char **argv) {
     // This function call returns 0 if there are no CUDA capable devices.
     if (deviceCount == 0) {
         printf("There are no available device(s) that support CUDA\n");
-    } else {
+    }
+    else {
         printf("Detected %d CUDA Capable device(s)\n", deviceCount);
     }

     for (dev = 0; dev < deviceCount; ++dev) {
-        checkCudaErrors(cuDeviceGetAttribute(
-            &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
+        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
+        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));

         checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));

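Note: the checkCudaErrors used throughout this hunk comes from helper_cuda_drvapi.h. A minimal stand-in (an illustrative sketch, not the sample's actual macro) could look like this:

    #include <cuda.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Illustrative stand-in for the sample's checkCudaErrors macro.
    #define CHECK_CU(call)                                              \
        do {                                                            \
            CUresult err_ = (call);                                     \
            if (err_ != CUDA_SUCCESS) {                                 \
                const char *name = NULL;                                \
                cuGetErrorName(err_, &name);                            \
                fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                        name ? name : "UNKNOWN", __FILE__, __LINE__);   \
                exit(EXIT_FAILURE);                                     \
            }                                                           \
        } while (0)

    int main(void)
    {
        int deviceCount = 0;
        CHECK_CU(cuInit(0));
        CHECK_CU(cuDeviceGetCount(&deviceCount));
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
        return 0;
    }
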
@@ -75,9 +74,9 @@ int main(int argc, char **argv) {
         int driverVersion = 0;
         checkCudaErrors(cuDriverGetVersion(&driverVersion));
         printf(" CUDA Driver Version: %d.%d\n",
-               driverVersion / 1000, (driverVersion % 100) / 10);
-        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major,
-               minor);
+               driverVersion / 1000,
+               (driverVersion % 100) / 10);
+        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);

         size_t totalGlobalMem;
         checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));

@@ -91,231 +90,169 @@ int main(int argc, char **argv) {
         printf("%s", msg);

         int multiProcessorCount;
-        getCudaAttribute<int>(&multiProcessorCount,
-                              CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

         printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
-               multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
+               multiProcessorCount,
+               _ConvertSMVer2CoresDRV(major, minor),
                _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);

         int clockRate;
         getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-        printf(
-            " GPU Max Clock rate: %.0f MHz (%0.2f "
+        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
                "GHz)\n",
-            clockRate * 1e-3f, clockRate * 1e-6f);
+               clockRate * 1e-3f,
+               clockRate * 1e-6f);
         int memoryClock;
-        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
-                              dev);
-        printf(" Memory Clock rate: %.0f Mhz\n",
-               memoryClock * 1e-3f);
+        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
+        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
         int memBusWidth;
-        getCudaAttribute<int>(&memBusWidth,
-                              CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
-        printf(" Memory Bus Width: %d-bit\n",
-               memBusWidth);
+        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
         int L2CacheSize;
         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

         if (L2CacheSize) {
-            printf(" L2 Cache Size: %d bytes\n",
-                   L2CacheSize);
+            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
         }

         int maxTex1D, maxTex2D[2], maxTex3D[3];
-        getCudaAttribute<int>(&maxTex1D,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
-        getCudaAttribute<int>(&maxTex2D[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
-        getCudaAttribute<int>(&maxTex2D[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
-        getCudaAttribute<int>(&maxTex3D[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
-        getCudaAttribute<int>(&maxTex3D[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
-        getCudaAttribute<int>(&maxTex3D[2],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
-        printf(
-            " Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
+        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
+        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
+        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
+        printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
                "3D=(%d, %d, %d)\n",
-            maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1],
+               maxTex1D,
+               maxTex2D[0],
+               maxTex2D[1],
+               maxTex3D[0],
+               maxTex3D[1],
                maxTex3D[2]);

         int maxTex1DLayered[2];
-        getCudaAttribute<int>(&maxTex1DLayered[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH,
-                              dev);
-        getCudaAttribute<int>(&maxTex1DLayered[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS,
-                              dev);
-        printf(
-            " Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
-            maxTex1DLayered[0], maxTex1DLayered[1]);
+        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
+        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
+               maxTex1DLayered[0],
+               maxTex1DLayered[1]);

         int maxTex2DLayered[3];
-        getCudaAttribute<int>(&maxTex2DLayered[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH,
-                              dev);
-        getCudaAttribute<int>(&maxTex2DLayered[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
-                              dev);
-        getCudaAttribute<int>(&maxTex2DLayered[2],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
-                              dev);
-        printf(
-            " Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
+        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
+        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
+        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
                "layers\n",
-            maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
+               maxTex2DLayered[0],
+               maxTex2DLayered[1],
+               maxTex2DLayered[2]);

         int totalConstantMemory;
-        getCudaAttribute<int>(&totalConstantMemory,
-                              CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
-        printf(" Total amount of constant memory: %u bytes\n",
-               totalConstantMemory);
+        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
+        printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
         int sharedMemPerBlock;
-        getCudaAttribute<int>(&sharedMemPerBlock,
-                              CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
-        printf(" Total amount of shared memory per block: %u bytes\n",
-               sharedMemPerBlock);
+        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
+        printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
         int regsPerBlock;
-        getCudaAttribute<int>(&regsPerBlock,
-                              CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
-        printf(" Total number of registers available per block: %d\n",
-               regsPerBlock);
+        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
+        printf(" Total number of registers available per block: %d\n", regsPerBlock);
         int warpSize;
         getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
         printf(" Warp size: %d\n", warpSize);
         int maxThreadsPerMultiProcessor;
-        getCudaAttribute<int>(&maxThreadsPerMultiProcessor,
-                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
-                              dev);
-        printf(" Maximum number of threads per multiprocessor: %d\n",
-               maxThreadsPerMultiProcessor);
+        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+        printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);
         int maxThreadsPerBlock;
-        getCudaAttribute<int>(&maxThreadsPerBlock,
-                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
-        printf(" Maximum number of threads per block: %d\n",
-               maxThreadsPerBlock);
+        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+        printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);

         int blockDim[3];
-        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                              dev);
-        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                              dev);
-        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                              dev);
-        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
-               blockDim[0], blockDim[1], blockDim[2]);
+        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
+        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
+        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
+        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
         int gridDim[3];
         getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
         getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
         getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
-        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
-               gridDim[0], gridDim[1], gridDim[2]);
+        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);

         int textureAlign;
-        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
-                              dev);
-        printf(" Texture alignment: %u bytes\n",
-               textureAlign);
+        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
+        printf(" Texture alignment: %u bytes\n", textureAlign);

         int memPitch;
         getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
-        printf(" Maximum memory pitch: %u bytes\n",
-               memPitch);
+        printf(" Maximum memory pitch: %u bytes\n", memPitch);

         int gpuOverlap;
         getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

         int asyncEngineCount;
-        getCudaAttribute<int>(&asyncEngineCount,
-                              CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
-        printf(
-            " Concurrent copy and kernel execution: %s with %d copy "
+        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+        printf(" Concurrent copy and kernel execution: %s with %d copy "
                "engine(s)\n",
-            (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
+               (gpuOverlap ? "Yes" : "No"),
+               asyncEngineCount);

         int kernelExecTimeoutEnabled;
-        getCudaAttribute<int>(&kernelExecTimeoutEnabled,
-                              CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
-        printf(" Run time limit on kernels: %s\n",
-               kernelExecTimeoutEnabled ? "Yes" : "No");
+        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
+        printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
         int integrated;
         getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
-        printf(" Integrated GPU sharing Host Memory: %s\n",
-               integrated ? "Yes" : "No");
+        printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");
         int canMapHostMemory;
-        getCudaAttribute<int>(&canMapHostMemory,
-                              CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
-        printf(" Support host page-locked memory mapping: %s\n",
-               canMapHostMemory ? "Yes" : "No");
+        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
+        printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");

         int concurrentKernels;
-        getCudaAttribute<int>(&concurrentKernels,
-                              CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
-        printf(" Concurrent kernel execution: %s\n",
-               concurrentKernels ? "Yes" : "No");
+        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
+        printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");

         int surfaceAlignment;
-        getCudaAttribute<int>(&surfaceAlignment,
-                              CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
-        printf(" Alignment requirement for Surfaces: %s\n",
-               surfaceAlignment ? "Yes" : "No");
+        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
+        printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");

         int eccEnabled;
         getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
-        printf(" Device has ECC support: %s\n",
-               eccEnabled ? "Enabled" : "Disabled");
+        printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
         int tccDriver;
         getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
         printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
-               tccDriver ? "TCC (Tesla Compute Cluster Driver)"
-                         : "WDDM (Windows Display Driver Model)");
+               tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 #endif

         int unifiedAddressing;
-        getCudaAttribute<int>(&unifiedAddressing,
-                              CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
-        printf(" Device supports Unified Addressing (UVA): %s\n",
-               unifiedAddressing ? "Yes" : "No");
+        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
+        printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");

         int managedMemory;
-        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
-                              dev);
-        printf(" Device supports Managed Memory: %s\n",
-               managedMemory ? "Yes" : "No");
+        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
+        printf(" Device supports Managed Memory: %s\n", managedMemory ? "Yes" : "No");

         int computePreemption;
-        getCudaAttribute<int>(&computePreemption,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
-                              dev);
-        printf(" Device supports Compute Preemption: %s\n",
-               computePreemption ? "Yes" : "No");
+        getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
+        printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No");

         int cooperativeLaunch;
-        getCudaAttribute<int>(&cooperativeLaunch,
-                              CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
-        printf(" Supports Cooperative Kernel Launch: %s\n",
-               cooperativeLaunch ? "Yes" : "No");
+        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
+        printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");

         int cooperativeMultiDevLaunch;
-        getCudaAttribute<int>(&cooperativeMultiDevLaunch,
-                              CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH,
-                              dev);
-        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
-               cooperativeMultiDevLaunch ? "Yes" : "No");
+        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
+        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");

         int pciDomainID, pciBusID, pciDeviceID;
         getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
         getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
         getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
-        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
-               pciDomainID, pciBusID, pciDeviceID);
+        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);

-        const char *sComputeMode[] = {
-            "Default (multiple host threads can use ::cudaSetDevice() with device "
+        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
             "simultaneously)",
             "Exclusive (only one host thread in one process is able to use "
             "::cudaSetDevice() with this device)",

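Note: most of the churn above is the repeated getCudaAttribute<int> helper being collapsed onto single lines. The helper itself ships in helper_cuda_drvapi.h; a rough sketch of the shape of such a wrapper (names and error handling here are illustrative assumptions, not the sample's code):

    #include <cuda.h>
    #include <cstdio>
    #include <cstdlib>

    // Illustrative wrapper over cuDeviceGetAttribute; the sample's real helper
    // lives in helper_cuda_drvapi.h.
    template <class T>
    void getCudaAttributeSketch(T *attribute, CUdevice_attribute attrib, int device)
    {
        int value = 0; // cuDeviceGetAttribute always reports through an int
        CUresult err = cuDeviceGetAttribute(&value, attrib, (CUdevice)device);
        if (err != CUDA_SUCCESS) {
            fprintf(stderr, "cuDeviceGetAttribute(%d) failed: %d\n", (int)attrib, (int)err);
            exit(EXIT_FAILURE);
        }
        *attribute = (T)value;
    }

    int main()
    {
        cuInit(0);
        int warpSize = 0;
        getCudaAttributeSketch<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, 0);
        printf("Warp size: %d\n", warpSize);
        return 0;
    }
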
@@ -323,7 +260,8 @@ int main(int argc, char **argv) {
             "device)",
             "Exclusive Process (many threads in one process is able to use "
             "::cudaSetDevice() with this device)",
-            "Unknown", NULL};
+            "Unknown",
+            NULL};

         int computeMode;
         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

@@ -338,10 +276,8 @@ int main(int argc, char **argv) {
     int tccDriver = 0;

     for (int i = 0; i < deviceCount; i++) {
-        checkCudaErrors(cuDeviceGetAttribute(
-            &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
+        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
+        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
         getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);

         // Only boards based on Fermi or later can support P2P

@@ -367,14 +303,15 @@ int main(int argc, char **argv) {
                 if (gpuid[i] == gpuid[j]) {
                     continue;
                 }
-                checkCudaErrors(
-                    cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
+                checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                 checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
                 checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
-                printf(
-                    "> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
+                printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
                        "%s\n",
-                    deviceName0, gpuid[i], deviceName1, gpuid[j],
+                       deviceName0,
+                       gpuid[i],
+                       deviceName1,
+                       gpuid[j],
                        can_access_peer ? "Yes" : "No");
             }
         }

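Note: the pairwise P2P check above reduces to one driver-API query per ordered device pair. A stripped-down sketch of the same enumeration (error handling elided; CUdevice ordinals are plain ints, so the loop indices can be passed directly):

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        int deviceCount = 0;
        cuInit(0);
        cuDeviceGetCount(&deviceCount);
        for (int i = 0; i < deviceCount; i++) {
            for (int j = 0; j < deviceCount; j++) {
                if (i == j)
                    continue;
                int canAccessPeer = 0;
                // Reports whether device i can directly address device j's memory.
                cuDeviceCanAccessPeer(&canAccessPeer, i, j);
                printf("GPU%d -> GPU%d : %s\n", i, j, canAccessPeer ? "Yes" : "No");
            }
        }
        return 0;
    }
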
@@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute
 Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
-

@@ -37,32 +37,30 @@
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper for shared that are common to CUDA Samples

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int deviceCount = 0;
     checkCudaErrors(cudaGetDeviceCount(&deviceCount));

     // Enumerates Device <-> Device links
     for (int device1 = 0; device1 < deviceCount; device1++) {
         for (int device2 = 0; device2 < deviceCount; device2++) {
-            if (device1 == device2) continue;
+            if (device1 == device2)
+                continue;

             int perfRank = 0;
             int atomicSupported = 0;
             int accessSupported = 0;

-            checkCudaErrors(cudaDeviceGetP2PAttribute(
-                &accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
-            checkCudaErrors(cudaDeviceGetP2PAttribute(
-                &perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
-            checkCudaErrors(cudaDeviceGetP2PAttribute(
-                &atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1,
-                device2));
+            checkCudaErrors(
+                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
+            checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
+            checkCudaErrors(
+                cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));

             if (accessSupported) {
-                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":"
-                          << std::endl;
-                std::cout << "  * Atomic Supported: "
-                          << (atomicSupported ? "yes" : "no") << std::endl;
+                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
+                std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                 std::cout << "  * Perf Rank: " << perfRank << std::endl;
             }
         }

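Note: topologyQuery only reports the link attributes; it never enables peer access. As a hedged follow-on sketch (not part of the sample), this is how a program would typically act on cudaDevP2PAttrAccessSupported before issuing peer copies:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main()
    {
        int deviceCount = 0;
        cudaGetDeviceCount(&deviceCount);
        for (int d1 = 0; d1 < deviceCount; d1++) {
            for (int d2 = 0; d2 < deviceCount; d2++) {
                if (d1 == d2)
                    continue;
                int accessSupported = 0;
                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, d1, d2);
                if (accessSupported) {
                    cudaSetDevice(d1);
                    // Grants d1 direct access to d2's memory; flags must be 0.
                    cudaError_t err = cudaDeviceEnablePeerAccess(d2, 0);
                    printf("GPU%d -> GPU%d peer access: %s\n", d1, d2,
                           err == cudaSuccess ? "enabled" : cudaGetErrorString(err));
                }
            }
        }
        return 0;
    }
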
@@ -71,11 +69,9 @@ int main(int argc, char **argv) {
     // Enumerates Device <-> Host links
     for (int device = 0; device < deviceCount; device++) {
         int atomicSupported = 0;
-        checkCudaErrors(cudaDeviceGetAttribute(
-            &atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
+        checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
         std::cout << "GPU" << device << " <-> CPU:" << std::endl;
-        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no")
-                  << std::endl;
+        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
     }

     return 0;

@@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-

@@ -29,12 +29,14 @@
 // DESCRIPTION: Simple CUDA consumer rendering sample app
 //

-#include <cuda_runtime.h>
 #include "cuda_consumer.h"
-#include "eglstrm_common.h"
+
+#include <cuda_runtime.h>
 #include <math.h>
 #include <unistd.h>

+#include "eglstrm_common.h"
+
 #if defined(EXTENSION_LIST)
 EXTENSION_LIST(EXTLST_EXTERN)
 #endif

@@ -47,19 +49,22 @@ static int count_rel = 0;
 static double rel_time[25000] = {0}, total_time_rel = 0;

 void acquireApiStat(void);
-void acquireApiStat(void) {
+void acquireApiStat(void)
+{
     int i = 0;
     double min = 10000000, max = 0;
     double average_launch_time = 0, standard_deviation = 0;
-    if (count_acq == 0) return;
+    if (count_acq == 0)
+        return;
     // lets compute the standard deviation
     min = max = acquire_time[1];
     average_launch_time = (total_time_acq - acquire_time[0]) / count_acq;
     for (i = 1; i < count_acq; i++) {
-        standard_deviation += (acquire_time[i] - average_launch_time) *
-                              (acquire_time[i] - average_launch_time);
-        if (acquire_time[i] < min) min = acquire_time[i];
-        if (acquire_time[i] > max) max = acquire_time[i];
+        standard_deviation += (acquire_time[i] - average_launch_time) * (acquire_time[i] - average_launch_time);
+        if (acquire_time[i] < min)
+            min = acquire_time[i];
+        if (acquire_time[i] > max)
+            max = acquire_time[i];
     }
     standard_deviation = sqrt(standard_deviation / count_acq);
     printf("acquire Avg: %lf\n", average_launch_time);

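Note: the statistics above are a plain mean and population standard deviation over the recorded call times. As a self-contained illustration of the same computation (the sample values are made up):

    #include <math.h>
    #include <stdio.h>

    // Population standard deviation of `count` samples around `mean`.
    static double stddev_of(const double *samples, int count, double mean)
    {
        double acc = 0;
        for (int i = 0; i < count; i++) {
            acc += (samples[i] - mean) * (samples[i] - mean);
        }
        return sqrt(acc / count);
    }

    int main(void)
    {
        double t[] = {1.0, 2.0, 3.0, 4.0};
        double mean = (1.0 + 2.0 + 3.0 + 4.0) / 4.0; // 2.5
        printf("stddev = %f\n", stddev_of(t, 4, mean)); // ~1.118034
        return 0;
    }
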
@@ -70,10 +75,11 @@ void acquireApiStat(void) {
     min = max = rel_time[1];
     average_launch_time = (total_time_rel - rel_time[0]) / count_rel;
     for (i = 1; i < count_rel; i++) {
-        standard_deviation += (rel_time[i] - average_launch_time) *
-                              (rel_time[i] - average_launch_time);
-        if (rel_time[i] < min) min = rel_time[i];
-        if (rel_time[i] > max) max = rel_time[i];
+        standard_deviation += (rel_time[i] - average_launch_time) * (rel_time[i] - average_launch_time);
+        if (rel_time[i] < min)
+            min = rel_time[i];
+        if (rel_time[i] > max)
+            max = rel_time[i];
     }
     standard_deviation = sqrt(standard_deviation / count_rel);
     printf("release Avg: %lf\n", average_launch_time);

@@ -81,8 +87,8 @@ void acquireApiStat(void) {
     printf("release min: %lf\n", min);
     printf("release max: %lf\n", max);
 }
-CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
-                                  int frameNumber) {
+CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber)
+{
     CUresult cuStatus = CUDA_SUCCESS;
     CUeglFrame cudaEgl;
     struct timespec start, end;

@@ -95,8 +101,7 @@ CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
     }

     while (1) {
-        if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream,
-                               EGL_STREAM_STATE_KHR, &streamState)) {
+        if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream, EGL_STREAM_STATE_KHR, &streamState)) {
             printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
             cuStatus = CUDA_ERROR_UNKNOWN;
             goto done;

@@ -115,33 +120,35 @@ CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
             getTime(&start);
         }
         cuStatus =
-            cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource,
-                                            &cudaConsumer->consCudaStream, 16000);
+            cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource, &cudaConsumer->consCudaStream, 16000);
         if (cudaConsumer->profileAPI) {
             getTime(&end);
             curTime = TIME_DIFF(end, start);
             acquire_time[count_acq++] = curTime;
-            if (count_acq == 25000) count_acq = 0;
+            if (count_acq == 25000)
+                count_acq = 0;
             total_time_acq += curTime;
         }
         if (cuStatus == CUDA_SUCCESS) {
             CUdeviceptr pDevPtr = 0;
             cudaError_t err;

-            cuStatus =
-                cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
+            cuStatus = cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
             if (cuStatus != CUDA_SUCCESS) {
                 printf("Cuda get resource failed with %d\n", cuStatus);
                 goto done;
             }
             pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0];

-            err = cudaConsumer_filter(cudaConsumer->consCudaStream, (char *)pDevPtr,
-                                      WIDTH * 4, HEIGHT, PROD_DATA + frameNumber,
-                                      CONS_DATA + frameNumber, frameNumber);
+            err = cudaConsumer_filter(cudaConsumer->consCudaStream,
+                                      (char *)pDevPtr,
+                                      WIDTH * 4,
+                                      HEIGHT,
+                                      PROD_DATA + frameNumber,
+                                      CONS_DATA + frameNumber,
+                                      frameNumber);
             if (err != cudaSuccess) {
-                printf("Cuda Consumer: kernel failed with: %s\n",
-                       cudaGetErrorString(err));
+                printf("Cuda Consumer: kernel failed with: %s\n", cudaGetErrorString(err));
                 goto done;
             }
         }

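Note: condensed, the consumer path in this hunk is acquire a frame from the EGL stream connection, map it to a CUeglFrame, run the filter kernel, and (in the next function) release it. A skeletal sketch of that sequence, assuming an already-connected CUeglStreamConnection and omitting the kernel and profiling (cudaEGL.h provides the interop entry points):

    #include <cudaEGL.h>
    #include <stdio.h>

    CUresult processOneFrame(CUeglStreamConnection *conn, CUstream stream)
    {
        CUgraphicsResource resource = NULL;
        CUeglFrame frame;

        // Wait for the producer, passing the sample's 16000 timeout value.
        CUresult status = cuEGLStreamConsumerAcquireFrame(conn, &resource, &stream, 16000);
        if (status != CUDA_SUCCESS)
            return status;

        status = cuGraphicsResourceGetMappedEglFrame(&frame, resource, 0, 0);
        if (status == CUDA_SUCCESS) {
            // frame.frame.pPitch[0] is the device pointer a consumer kernel reads.
            printf("acquired frame at %p\n", frame.frame.pPitch[0]);
        }
        // Hand the frame back to the producer side of the stream.
        return cuEGLStreamConsumerReleaseFrame(conn, resource, &stream);
    }
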
@@ -150,8 +157,8 @@ done:
     return cuStatus;
 }

-CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer,
-                                  int frameNumber) {
+CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber)
+{
     CUresult cuStatus = CUDA_SUCCESS;
     struct timespec start, end;
     double curTime;

@@ -163,13 +170,13 @@ CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer,
     if (cudaConsumer->profileAPI) {
         getTime(&start);
     }
-    cuStatus = cuEGLStreamConsumerReleaseFrame(
-        &cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
+    cuStatus = cuEGLStreamConsumerReleaseFrame(&cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
     if (cudaConsumer->profileAPI) {
         getTime(&end);
         curTime = TIME_DIFF(end, start);
         rel_time[count_rel++] = curTime;
-        if (count_rel == 25000) count_rel = 0;
+        if (count_rel == 25000)
+            count_rel = 0;
         total_time_rel += curTime;
     }
     if (cuStatus != CUDA_SUCCESS) {

@@ -181,7 +188,8 @@ done:
     return cuStatus;
 }

-CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
+CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer)
+{
     CUdevice device;
     CUresult status = CUDA_SUCCESS;

@@ -190,34 +198,31 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
         return status;
     }

-    if (CUDA_SUCCESS !=
-        (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) {
+    if (CUDA_SUCCESS != (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) {
         printf("failed to get CUDA device\n");
         return status;
     }

-    if (CUDA_SUCCESS !=
-        (status = cuCtxCreate(&cudaConsumer->context, 0, device))) {
+    if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) {
         printf("failed to create CUDA context\n");
         return status;
     }

     int major = 0, minor = 0;
     char deviceName[256];
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                         device);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                         device);
+    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
     cuDeviceGetName(deviceName, 256, device);
-    printf(
-        "CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
+    printf("CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
            "%d.%d\n\n",
-        device, deviceName, major, minor);
+           device,
+           deviceName,
+           major,
+           minor);

     cuCtxPopCurrent(&cudaConsumer->context);
     if (major < 6) {
-        printf(
-            "EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. "
+        printf("EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. "
                "Exiting...\n");
         exit(2); // EXIT_WAIVED
     }

@@ -225,8 +230,8 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
     return status;
 }

-CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer,
-                            TestArgs *args) {
+CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args)
+{
     CUresult status = CUDA_SUCCESS;
     int bufferSize;

@@ -250,7 +255,8 @@ done:
     return status;
 }

-CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer) {
+CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer)
+{
     if (cudaConsumer->pCudaCopyMem) {
         free(cudaConsumer->pCudaCopyMem);
     }